chunkrank 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Amit Nautiyal
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.4
2
+ Name: chunkrank
3
+ Version: 0.2.0
4
+ Summary: Model-Aware Chunking + Answer Ranking
5
+ License: MIT
6
+ License-File: LICENCE
7
+ Author: Your Name
8
+ Author-email: you@example.com
9
+ Requires-Python: >=3.14,<4.0
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.14
13
+ Requires-Dist: deptry (>=0.24.0,<0.25.0)
14
+ Requires-Dist: mypy (>=1.19.1,<2.0.0)
15
+ Requires-Dist: numpy (>=1.26)
16
+ Requires-Dist: rank-bm25 (>=0.2.2)
17
+ Requires-Dist: regex (>=2023.10.3)
18
+ Requires-Dist: ruff (>=0.14.10,<0.15.0)
19
+ Requires-Dist: scikit-learn (>=1.5)
20
+ Description-Content-Type: text/markdown
21
+
22
+ # ChunkRank: Model-Aware Chunking + Answer Ranking
23
+
24
+ ## Problem
25
+ When using LLMs, text often exceeds the model’s context window.
26
+ To handle this, text must be **chunked** into pieces that fit the model’s maximum token length.
27
+
28
+ Two challenges arise:
29
+ 1. **Model-aware chunking**
30
+ Each model family (OpenAI, Anthropic, Llama, Gemini, T5, BERT, BigBird, Longformer, etc.) has a different context length and tokenizer.
31
+ Current libraries require users to manually configure chunk sizes; no unified library automatically adapts to the chosen model.
32
+
33
+ 2. **Answer consolidation & ranking**
34
+ Once text is chunked, a query may return multiple answers from different chunks.
35
+ A **ranking step** is needed to decide the best, most relevant answer.
36
+ Existing solutions (e.g., RAG frameworks) combine retrieval + generation, but there’s no standalone library that couples **chunking** and **answer re-ranking**.
37
+
38
+ ---
39
+
40
+ ## Existing Libraries & Gaps
41
+
42
+ ### Chunking
43
+ - **LangChain Text Splitters** → Token-based, works with `tiktoken`, but requires manual chunk size config.
44
+ - **LlamaIndex `TokenTextSplitter`** → Similar functionality, manual sizing.
45
+ - **Haystack `PreProcessor`** → Can split by tokens, overlap supported, but not model-aware by default.
46
+ - **semantic-text-splitter / semchunk** → Standalone, supports tiktoken/HF tokenizers, still needs user-specified chunk length.
47
+
48
+ **Gap:** None of these libraries automatically map a model → tokenizer → context window → chunk size.
49
+
50
+ ---
51
+
52
+ ### Ranking
53
+ - **pygaggle** (Waterloo's Castorini group) → neural re-ranker.
54
+ - **Tevatron** → dense retrieval + re-ranking toolkit.
55
+ - **Pyserini** (with pygaggle) → BM25 + neural re-rankers.
56
+ - **Haystack, LlamaIndex** → include ranking in RAG pipelines.
57
+
58
+ **Gap:** Ranking exists, but **not combined with chunking** in a single, simple package.
59
+
60
+ ---
61
+
62
+ ## What We Want to Build
63
+ A standalone Python library that:
64
+
65
+ 1. **Model-Aware Chunking**
66
+ - User specifies a model name (e.g., `gpt-4o-mini`, `claude-3.5-sonnet`, `Llama-3.1-8B`).
67
+ - Library looks up the model’s max context window and tokenizer.
68
+ - Automatically chunks text into model-compatible pieces with optional overlap and reserve space.
69
+
70
+ 2. **Answer Consolidation & Ranking**
71
+ - Given multiple answers from chunks, apply a re-ranking step to select the best one.
72
+ - Should integrate with existing ranking models (cross-encoder, bi-encoder, BM25 + re-ranker).
73
+ - Should work standalone, without needing a full RAG pipeline.
74
+
75
+ 3. **Unified Workflow**
76
+ - `chunks = chunkrank.split(text, model="gpt-4o-mini")`
77
+ - `answers = chunkrank.answer(question, chunks)`
78
+ - `best = chunkrank.rank(answers)`
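+ 
+ A minimal sketch of that workflow with the modules actually shipped in this version (`chunk_text` from the chunker and `rank_answers` from the ranker; the `chunkrank.split`/`answer`/`rank` helpers listed above describe the planned top-level API). The file name and question are placeholders:
+ 
+ ```python
+ from chunkrank import chunk_text
+ from chunkrank.ranker import rank_answers  # module path inferred from the package's relative imports
+ 
+ text = open("report.txt").read()           # placeholder input document
+ question = "What was the total revenue in 2024?"
+ 
+ # Model-aware chunking: context window and tokenizer are resolved from the model name.
+ chunks = chunk_text(text, model="gpt-4o-mini", overlap_tokens=64)
+ 
+ # Stand-in per-chunk answers (e.g., from an LLM call or an extractive QA step).
+ answers = [f"Candidate answer from chunk {i}" for i, _ in enumerate(chunks, 1)]
+ 
+ # Consolidation: BM25 re-ranking picks the answer most relevant to the question.
+ print(rank_answers(question, answers, method="bm25"))
+ ```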
79
+
80
+ ---
81
+
82
+ ## Vision
83
+ - Lightweight, model-agnostic utility library.
84
+ - Bridges the gap between **text preparation** (chunking) and **answer quality** (ranking).
85
+ - Complements existing RAG frameworks but can also work independently.
86
+ - Easy to drop into pipelines: preprocessing for QA, summarization, or information extraction.
87
+
88
+ ---
89
+
90
+ ## Next Steps
91
+ 1. Build the **model registry** (model → context window + tokenizer).
92
+ 2. Implement **chunking strategies** (tokens, sentences, paragraphs).
93
+ 3. Integrate a **re-ranking engine** (start with Hugging Face cross-encoder).
94
+ 4. Package and release to PyPI with a simple API.
95
+
96
+ ---
97
+
98
+ ## Community
99
+
100
+ - [Contributors](CONTRIBUTORS.md)
101
+ - [Maintainers](MAINTAINERS.md)
102
+ - [Contributing Guidelines](CONTRIBUTING.md)
103
+
104
+
@@ -0,0 +1,82 @@
1
+ # ChunkRank: Model-Aware Chunking + Answer Ranking
2
+
3
+ ## Problem
4
+ When using LLMs, text often exceeds the model’s context window.
5
+ To handle this, text must be **chunked** into pieces that fit the model’s maximum token length.
6
+
7
+ Two challenges arise:
8
+ 1. **Model-aware chunking**
9
+ Each model family (OpenAI, Anthropic, Llama, Gemini, T5, BERT, BigBird, Longformer, etc.) has a different context length and tokenizer.
10
+ Current libraries require users to manually configure chunk sizes; no unified library automatically adapts to the chosen model.
11
+
12
+ 2. **Answer consolidation & ranking**
13
+ Once text is chunked, a query may return multiple answers from different chunks.
14
+ A **ranking step** is needed to decide the best, most relevant answer.
15
+ Existing solutions (e.g., RAG frameworks) combine retrieval + generation, but there’s no standalone library that couples **chunking** and **answer re-ranking**.
16
+
17
+ ---
18
+
19
+ ## Existing Libraries & Gaps
20
+
21
+ ### Chunking
22
+ - **LangChain Text Splitters** → Token-based, works with `tiktoken`, but requires manual chunk size config.
23
+ - **LlamaIndex `TokenTextSplitter`** → Similar functionality, manual sizing.
24
+ - **Haystack `PreProcessor`** → Can split by tokens, overlap supported, but not model-aware by default.
25
+ - **semantic-text-splitter / semchunk** → Standalone, supports tiktoken/HF tokenizers, still needs user-specified chunk length.
26
+
27
+ **Gap:** None of these libraries automatically map a model → tokenizer → context window → chunk size.
28
+
29
+ ---
30
+
31
+ ### Ranking
32
+ - **pygaggle** (Waterloo's Castorini group) → neural re-ranker.
33
+ - **Tevatron** → dense retrieval + re-ranking toolkit.
34
+ - **Pyserini** (with pygaggle) → BM25 + neural re-rankers.
35
+ - **Haystack, LlamaIndex** → include ranking in RAG pipelines.
36
+
37
+ **Gap:** Ranking exists, but **not combined with chunking** in a single, simple package.
38
+
39
+ ---
40
+
41
+ ## What We Want to Build
42
+ A standalone Python library that:
43
+
44
+ 1. **Model-Aware Chunking**
45
+ - User specifies a model name (e.g., `gpt-4o-mini`, `claude-3.5-sonnet`, `Llama-3.1-8B`).
46
+ - Library looks up the model’s max context window and tokenizer.
47
+ - Automatically chunks text into model-compatible pieces with optional overlap and reserve space.
48
+
49
+ 2. **Answer Consolidation & Ranking**
50
+ - Given multiple answers from chunks, apply a re-ranking step to select the best one.
51
+ - Should integrate with existing ranking models (cross-encoder, bi-encoder, BM25 + re-ranker).
52
+ - Should work standalone, without needing a full RAG pipeline.
53
+
54
+ 3. **Unified Workflow**
55
+ - `chunks = chunkrank.split(text, model="gpt-4o-mini")`
56
+ - `answers = chunkrank.answer(question, chunks)`
57
+ - `best = chunkrank.rank(answers)`
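+ 
+ For a one-call version of this flow, the `ChunkRankPipeline` shipped in this release wires chunking and ranking together (its per-chunk answering step is still a placeholder). A minimal sketch; `long_report` is a stand-in for any long document string:
+ 
+ ```python
+ from chunkrank import ChunkRankPipeline
+ 
+ long_report = "..."  # placeholder: any text that may exceed the model's context window
+ 
+ # Resolves the model's context window, chunks the text, generates placeholder
+ # per-chunk answers, and returns the top BM25-ranked one.
+ pipeline = ChunkRankPipeline(model="gpt-4o-mini")
+ print(pipeline.process("What are the key findings?", long_report))
+ ```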
58
+
59
+ ---
60
+
61
+ ## Vision
62
+ - Lightweight, model-agnostic utility library.
63
+ - Bridges the gap between **text preparation** (chunking) and **answer quality** (ranking).
64
+ - Complements existing RAG frameworks but can also work independently.
65
+ - Easy to drop into pipelines: preprocessing for QA, summarization, or information extraction.
66
+
67
+ ---
68
+
69
+ ## Next Steps
70
+ 1. Build the **model registry** (model → context window + tokenizer).
71
+ 2. Implement **chunking strategies** (tokens, sentences, paragraphs).
72
+ 3. Integrate a **re-ranking engine** (start with Hugging Face cross-encoder).
73
+ 4. Package and release to PyPI with a simple API.
74
+
75
+ ---
76
+
77
+ ## Community
78
+
79
+ - [Contributors](CONTRIBUTORS.md)
80
+ - [Maintainers](MAINTAINERS.md)
81
+ - [Contributing Guidelines](CONTRIBUTING.md)
82
+
@@ -0,0 +1,8 @@
1
+ from .chunker import Chunker, chunk_text
2
+ from .pipeline import ChunkRankPipeline
3
+
4
+ __all__ = [
5
+ "Chunker",
6
+ "chunk_text",
7
+ "ChunkRankPipeline",
8
+ ]
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple
4
+ import re
5
+
6
+
7
+ @dataclass
8
+ class LocalExtractiveAnswerer:
9
+ min_overlap: int = 2
10
+
11
+ def answer(self, question: str, context: str) -> str:
12
+ sentences = _split_sentences(context)
13
+ if not sentences:
14
+ return ""
15
+
16
+ q_words = _norm_words(question)
17
+ best: Tuple[str, int] = ("", 0)
18
+
19
+ for s in sentences:
20
+ s_words = _norm_words(s)
21
+ overlap = len(q_words.intersection(s_words))
22
+ if overlap > best[1]:
23
+ best = (s, overlap)
24
+
25
+ if best[1] < self.min_overlap:
26
+ return ""
27
+ return best[0].strip()
28
+
29
+
30
+ def _split_sentences(text: str) -> List[str]:
31
+ parts = re.split(r"(?<=[.!?])\s+", text.strip())
32
+ return [p.strip() for p in parts if p.strip()]
33
+
34
+
35
+ def _norm_words(text: str) -> set[str]:
36
+ words = re.findall(r"[A-Za-z0-9']+", text.lower())
37
+ return set(words)
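A quick usage sketch of the extractive answerer defined above. The import path `chunkrank.answerer` is an assumption, since this diff does not show file names:

```python
from chunkrank.answerer import LocalExtractiveAnswerer  # assumed module path; not shown in the diff

answerer = LocalExtractiveAnswerer(min_overlap=2)
context = (
    "ChunkRank splits text by token budget. "
    "It then ranks candidate answers with BM25 or TF-IDF. "
    "The library targets Python 3.14."
)

# Returns the sentence with the most word overlap with the question,
# or "" when fewer than min_overlap words match.
print(answerer.answer("How does ChunkRank rank candidate answers?", context))
```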
@@ -0,0 +1,93 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import List, Literal, Optional
5
+
6
+ from .models import get_model_info
7
+ from .tokenizers import build_tokenizer
8
+
9
+ Strategy = Literal["tokens"] # keep it simple for now
10
+
11
+
12
+ @dataclass
13
+ class ChunkerConfig:
14
+ model: str
15
+ strategy: Strategy = "tokens"
16
+ overlap_tokens: int = 0
17
+ reserve_tokens: Optional[int] = None
18
+
19
+
20
+ class Chunker:
21
+ def __init__(self, config: ChunkerConfig):
22
+ info = get_model_info(config.model)
23
+
24
+ reserve = config.reserve_tokens if config.reserve_tokens is not None else info.default_reserve
25
+ self.window = max(1, info.max_context - max(0, reserve))
26
+
27
+ self.overlap = max(0, config.overlap_tokens)
28
+ if self.overlap >= self.window:
29
+ raise ValueError("overlap_tokens must be < usable window size")
30
+
31
+ self.strategy = config.strategy
32
+ self.tok = build_tokenizer(info.tokenizer, info.tokenizer_id)
33
+
34
+ def split(self, text: str) -> List[str]:
35
+ if not isinstance(text, str) or not text:
36
+ return []
37
+
38
+ if self.strategy != "tokens":
39
+ raise NotImplementedError("Only 'tokens' strategy is implemented in this version.")
40
+
41
+ return list(self._chunk_by_token_budget(text))
42
+
43
+ def _chunk_by_token_budget(self, text: str):
44
+ """
45
+ Robust approach: grow a slice until the token budget is reached, then emit that slice.
46
+ Avoids needing tokenizer.decode(), so chunks are never None.
47
+ """
48
+ start = 0
49
+ n = len(text)
50
+
51
+ # Fast path: already fits
52
+ if self.tok.count(text) <= self.window:
53
+ yield text
54
+ return
55
+
56
+ # Character-based upper bound for initial probe (roughly 4 chars/token)
57
+ approx_chars = max(64, self.window * 4)
58
+
59
+ while start < n:
60
+ end = min(n, start + approx_chars)
61
+ chunk = text[start:end]
62
+
63
+ # If too big, shrink
64
+ while end > start and self.tok.count(chunk) > self.window:
65
+ end = start + max(1, (end - start) * 9 // 10)
66
+ chunk = text[start:end]
67
+
68
+ # If somehow cannot shrink (pathological), force a minimal progress
69
+ if end <= start:
70
+ end = min(n, start + 200)
71
+ chunk = text[start:end]
72
+
73
+ yield chunk
74
+
75
+ if end >= n:
76
+ break
77
+
78
+ # overlap handling (approx char backoff)
79
+ if self.overlap > 0:
80
+ backoff_chars = self.overlap * 4
81
+ start = max(0, end - backoff_chars)
82
+ else:
83
+ start = end
84
+
85
+
86
+ def chunk_text(
87
+ text: str,
88
+ model: str,
89
+ overlap_tokens: int = 0,
90
+ reserve_tokens: Optional[int] = None,
91
+ ) -> List[str]:
92
+ cfg = ChunkerConfig(model=model, overlap_tokens=overlap_tokens, reserve_tokens=reserve_tokens)
93
+ return Chunker(cfg).split(text)
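A small usage sketch of the chunker above. The usable window is `max_context - reserve` from the model registry; the 512-token `bert-base-uncased` entry keeps the example small, and if `transformers` is not installed, token counts fall back to the rough 4-characters-per-token heuristic used when no tokenizer backend is available:

```python
from chunkrank import Chunker, chunk_text
from chunkrank.chunker import ChunkerConfig  # module path inferred from the package's own imports

text = "word " * 5000  # long enough to overflow a 512-token context window

# Convenience wrapper: model name -> tokenizer + context window -> chunks.
chunks = chunk_text(text, model="bert-base-uncased", overlap_tokens=32)

# Equivalent explicit configuration (reserve_tokens=64 is that model's default_reserve).
chunker = Chunker(ChunkerConfig(model="bert-base-uncased", overlap_tokens=32, reserve_tokens=64))
assert chunker.split(text) == chunks

print(f"{len(chunks)} chunks, usable window = {chunker.window} tokens")
```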
@@ -0,0 +1,31 @@
1
+ import importlib.resources
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Optional
5
+
6
+
7
+ @dataclass
8
+ class ModelInfo:
9
+ name: str
10
+ max_context: int
11
+ tokenizer: Optional[str]
12
+ tokenizer_id: Optional[str]
13
+ default_reserve: int = 256
14
+
15
+
16
+ def load_registry() -> Dict[str, ModelInfo]:
17
+ "Loads the model registry from the json file"
18
+ with importlib.resources.open_text("chunkrank.registry", "model_registry.json") as file:
19
+ data = json.load(file)
20
+ return {k: ModelInfo(**v) for k, v in data.items()}
21
+
22
+
23
+ def get_model_info(model: str) -> ModelInfo:
24
+ registry = load_registry()
25
+ if model in registry:
26
+ return registry[model]
27
+ return ModelInfo(model,
28
+ 128_000,
29
+ "tiktoken",
30
+ "o200k_base",
31
+ 512)
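A sketch of how the lookup behaves; registry hits return the bundled values, while unknown model names fall back to a generous 128k-token default rather than raising:

```python
from chunkrank.models import get_model_info  # module path inferred from the chunker's imports

# Known model: values come straight from model_registry.json.
info = get_model_info("bert-base-uncased")
print(info.max_context, info.tokenizer, info.default_reserve)   # 512 hf 64

# Unknown model: fallback (128k window, o200k_base tiktoken encoding, 512-token reserve).
fallback = get_model_info("my-internal-model")
print(fallback.max_context, fallback.tokenizer_id)              # 128000 o200k_base
```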
@@ -0,0 +1,14 @@
1
+ from .chunker import Chunker, ChunkerConfig
2
+ from .ranker import Ranker
3
+
4
+
5
+ class ChunkRankPipeline:
6
+ def __init__(self, model: str):
7
+ self.chunker = Chunker(ChunkerConfig(model=model))
8
+ self.ranker = Ranker()
9
+
10
+ def process(self, question: str, text: str) -> str:
11
+ chunks = self.chunker.split(text)
12
+ answers = [f"Answer from chunk {i}" for i, _ in enumerate(chunks, 1)] # placeholder
13
+ best = self.ranker.rank(question, answers)[0][0]
14
+ return best
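The per-chunk answers above are placeholders; one way to make them real is to compose the chunker, the local extractive answerer, and the ranker by hand. A sketch, with the answerer's import path assumed since file names are not shown in this diff:

```python
from chunkrank import Chunker
from chunkrank.chunker import ChunkerConfig
from chunkrank.ranker import Ranker
from chunkrank.answerer import LocalExtractiveAnswerer  # assumed module path

def answer_question(question: str, text: str, model: str = "gpt-4o-mini") -> str:
    chunks = Chunker(ChunkerConfig(model=model)).split(text)
    answerer = LocalExtractiveAnswerer()
    # One candidate per chunk; empty answers are filtered out by Ranker.rank().
    answers = [answerer.answer(question, chunk) for chunk in chunks]
    ranked = Ranker(method="bm25").rank(question, answers)
    return ranked[0][0] if ranked else ""
```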
@@ -0,0 +1,56 @@
1
+ from typing import List, Tuple
2
+ import numpy as np
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ from rank_bm25 import BM25Okapi
6
+
7
+
8
+ class Ranker:
9
+
10
+ def __init__(self, method: str = "bm25"):
11
+ self.method = method
12
+
13
+ def rank(self, question: str, answers: List[str]) -> List[Tuple[str, float]]:
14
+ clean = [a for a in answers if isinstance(a, str) and a.strip()]
15
+ if not clean:
16
+ return []
17
+
18
+ if self.method == "tfidf":
19
+ return self._rank_tfidf(question, clean)
20
+ elif self.method == "bm25":
21
+ return self._rank_bm25(question, clean)
22
+ else:
23
+ raise ValueError(f"Unknown ranking method: {self.method}")
24
+
25
+ def _rank_tfidf(self, question: str, answers: List[str]) -> List[Tuple[str, float]]:
26
+ vectorizer = TfidfVectorizer(stop_words="english")
27
+ corpus = [question] + answers
28
+ vectors = vectorizer.fit_transform(corpus)
29
+
30
+ q_vec = vectors[0]
31
+ a_vecs = vectors[1:]
32
+
33
+ scores = cosine_similarity(q_vec, a_vecs)[0]
34
+ return sorted(zip(answers, scores), key=lambda x: x[1], reverse=True)
35
+
36
+ def _rank_bm25(self, question: str, answers: List[str]) -> List[Tuple[str, float]]:
37
+ tokenized_answers = [a.split() for a in answers if a and a.strip()]
38
+ if not tokenized_answers:
39
+ return []
40
+ bm25 = BM25Okapi(tokenized_answers)
41
+
42
+ q_tokens = question.split()
43
+ scores = bm25.get_scores(q_tokens)
44
+ return sorted(zip(answers, scores), key=lambda x: x[1], reverse=True)
45
+
46
+ def rank_texts(self, query: str, texts: List[str]) -> List[Tuple[str, float]]:
47
+ """
48
+ Rank raw texts (chunks) against a query.
49
+ """
50
+ return self.rank(query, texts)
51
+
52
+
53
+ def rank_answers(question: str, answers: List[str], method: str = "bm25") -> str:
54
+ ranker = Ranker(method=method)
55
+ ranked = ranker.rank(question, answers)
56
+ return ranked[0][0]
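A short sketch contrasting the two back-ends above; both return `(answer, score)` pairs sorted best-first:

```python
from chunkrank.ranker import Ranker, rank_answers

question = "Which answer mentions the 2024 revenue figure?"
answers = [
    "Total revenue in 2024 was $4.2M, up 18% year over year.",
    "The company was founded in 2015 in Berlin.",
    "Headcount grew to 120 employees.",
]

# BM25 over whitespace tokens (the default method).
for answer, score in Ranker(method="bm25").rank(question, answers):
    print(f"{score:6.3f}  {answer}")

# TF-IDF + cosine similarity via the convenience wrapper; returns only the top answer.
print(rank_answers(question, answers, method="tfidf"))
```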
File without changes
@@ -0,0 +1,149 @@
1
+ {
2
+ "gpt-4o-mini": {
3
+ "name": "gpt-4o-mini",
4
+ "max_context": 128000,
5
+ "tokenizer": "tiktoken",
6
+ "tokenizer_id": "o200k_base",
7
+ "default_reserve": 512
8
+ },
9
+ "claude-3-5-sonnet": {
10
+ "name": "claude-3-5-sonnet",
11
+ "max_context": 200000,
12
+ "tokenizer": "tiktoken",
13
+ "tokenizer_id": "o200k_base",
14
+ "default_reserve": 512
15
+ },
16
+ "Llama-3.1-8B": {
17
+ "name": "Llama-3.1-8B",
18
+ "max_context": 128000,
19
+ "tokenizer": "hf",
20
+ "tokenizer_id": "meta-llama/Llama-3.1-8B",
21
+ "default_reserve": 512
22
+ },
23
+ "Llama-3.1-70B": {
24
+ "name": "Llama-3.1-70B",
25
+ "max_context": 128000,
26
+ "tokenizer": "hf",
27
+ "tokenizer_id": "meta-llama/Llama-3.1-70B",
28
+ "default_reserve": 1024
29
+ },
30
+ "mistral-7b": {
31
+ "name": "Mistral-7B",
32
+ "max_context": 32768,
33
+ "tokenizer": "hf",
34
+ "tokenizer_id": "mistralai/Mistral-7B-v0.2",
35
+ "default_reserve": 512
36
+ },
37
+ "mixtral-8x7b": {
38
+ "name": "Mixtral-8x7B",
39
+ "max_context": 65536,
40
+ "tokenizer": "hf",
41
+ "tokenizer_id": "mistralai/Mixtral-8x7B-Instruct",
42
+ "default_reserve": 1024
43
+ },
44
+ "gpt-neo-2.7B": {
45
+ "name": "GPT-Neo-2.7B",
46
+ "max_context": 2048,
47
+ "tokenizer": "hf",
48
+ "tokenizer_id": "EleutherAI/gpt-neo-2.7B",
49
+ "default_reserve": 128
50
+ },
51
+ "gpt-j-6B": {
52
+ "name": "GPT-J-6B",
53
+ "max_context": 4096,
54
+ "tokenizer": "hf",
55
+ "tokenizer_id": "EleutherAI/gpt-j-6B",
56
+ "default_reserve": 256
57
+ },
58
+ "bert-base-uncased": {
59
+ "name": "BERT Base Uncased",
60
+ "max_context": 512,
61
+ "tokenizer": "hf",
62
+ "tokenizer_id": "bert-base-uncased",
63
+ "default_reserve": 64
64
+ },
65
+ "bert-large-uncased": {
66
+ "name": "BERT Large Uncased",
67
+ "max_context": 512,
68
+ "tokenizer": "hf",
69
+ "tokenizer_id": "bert-large-uncased",
70
+ "default_reserve": 64
71
+ },
72
+ "distilbert-base-uncased": {
73
+ "name": "DistilBERT Base Uncased",
74
+ "max_context": 512,
75
+ "tokenizer": "hf",
76
+ "tokenizer_id": "distilbert-base-uncased",
77
+ "default_reserve": 64
78
+ },
79
+ "bigbird-roberta-base": {
80
+ "name": "BigBird RoBERTa Base",
81
+ "max_context": 4096,
82
+ "tokenizer": "hf",
83
+ "tokenizer_id": "google/bigbird-roberta-base",
84
+ "default_reserve": 256
85
+ },
86
+ "bigbird-roberta-large": {
87
+ "name": "BigBird RoBERTa Large",
88
+ "max_context": 4096,
89
+ "tokenizer": "hf",
90
+ "tokenizer_id": "google/bigbird-roberta-large",
91
+ "default_reserve": 256
92
+ },
93
+ "longformer-base-4096": {
94
+ "name": "Longformer Base 4096",
95
+ "max_context": 4096,
96
+ "tokenizer": "hf",
97
+ "tokenizer_id": "allenai/longformer-base-4096",
98
+ "default_reserve": 256
99
+ },
100
+ "longformer-large-4096": {
101
+ "name": "Longformer Large 4096",
102
+ "max_context": 4096,
103
+ "tokenizer": "hf",
104
+ "tokenizer_id": "allenai/longformer-large-4096",
105
+ "default_reserve": 256
106
+ },
107
+ "deberta-v3-base": {
108
+ "name": "DeBERTa v3 Base",
109
+ "max_context": 512,
110
+ "tokenizer": "hf",
111
+ "tokenizer_id": "microsoft/deberta-v3-base",
112
+ "default_reserve": 64
113
+ },
114
+ "deberta-v3-large": {
115
+ "name": "DeBERTa v3 Large",
116
+ "max_context": 512,
117
+ "tokenizer": "hf",
118
+ "tokenizer_id": "microsoft/deberta-v3-large",
119
+ "default_reserve": 64
120
+ },
121
+ "t5-base": {
122
+ "name": "T5 Base",
123
+ "max_context": 512,
124
+ "tokenizer": "hf",
125
+ "tokenizer_id": "t5-base",
126
+ "default_reserve": 64
127
+ },
128
+ "t5-large": {
129
+ "name": "T5 Large",
130
+ "max_context": 512,
131
+ "tokenizer": "hf",
132
+ "tokenizer_id": "t5-large",
133
+ "default_reserve": 64
134
+ },
135
+ "flan-t5-xl": {
136
+ "name": "FLAN-T5 XL",
137
+ "max_context": 2048,
138
+ "tokenizer": "hf",
139
+ "tokenizer_id": "google/flan-t5-xl",
140
+ "default_reserve": 128
141
+ },
142
+ "gemini-pro-placeholder": {
143
+ "name": "Gemini Pro",
144
+ "max_context": 128000,
145
+ "tokenizer": "sentencepiece",
146
+ "tokenizer_id": "google/gemini",
147
+ "default_reserve": 512
148
+ }
149
+ }
@@ -0,0 +1,40 @@
1
+
2
+ def _try_importing_tiktoken():
3
+ try:
4
+ import tiktoken
5
+ return tiktoken
6
+ except ImportError:
7
+ return None
8
+
9
+ def _try_importing_transformers():
10
+ try:
11
+ import transformers
12
+ return transformers
13
+ except ImportError:
14
+ return None
15
+
16
+
17
+ class TokenizerAdapter:
18
+ def __init__(self, encode_fn: Callable[[str], List[int]]):
19
+ self._encode = encode_fn
20
+
21
+ def encode(self, text: str) -> List[int]:
22
+ return self._encode(text)
23
+
24
+ def count(self, text: str) -> int:
25
+ return len(self._encode(text))
26
+
27
+
28
+ def build_tokenizer(backend: Optional[str], tokenizer_id: Optional[str]) -> TokenizerAdapter:
29
+ if backend == "tiktoken":
30
+ tiktoken = _try_importing_tiktoken()
31
+ if tiktoken:
32
+ enc = tiktoken.get_encoding(tokenizer_id or "o200k_base")
33
+ return TokenizerAdapter(lambda s: enc.encode(s, disallowed_special=()))
34
+ elif backend == "hf":
35
+ transformers = _try_importing_transformers()
36
+ if transformers:
37
+ tok = transformers.AutoTokenizer.from_pretrained(tokenizer_id, use_fast=True)
38
+ return TokenizerAdapter(lambda s: tok.encode(s, add_special_tokens=False))
39
+
40
+ return TokenizerAdapter(lambda s: list(range(len(s) // 4)))
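A sketch of the adapter above. The tiktoken and Hugging Face backends are optional at runtime; when neither library is importable, `count()` silently falls back to the rough 4-characters-per-token heuristic on the last line:

```python
from chunkrank.tokenizers import build_tokenizer  # module path inferred from the chunker's imports

text = "ChunkRank counts tokens before it splits anything."

# tiktoken backend (o200k_base); degrades to the character heuristic if tiktoken is absent.
tok = build_tokenizer("tiktoken", "o200k_base")
print(tok.count(text))

# Hugging Face backend; downloads the tokenizer on first use if transformers is installed.
hf_tok = build_tokenizer("hf", "bert-base-uncased")
print(hf_tok.count(text), hf_tok.encode(text)[:5])
```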
File without changes
@@ -0,0 +1,25 @@
1
+ [project]
2
+ name = "chunkrank"
3
+ version = "0.2.0"
4
+ description = "Model-Aware Chunking + Answer Ranking"
5
+ authors = [
6
+ { name = "Your Name", email = "you@example.com" }
7
+ ]
8
+ readme = "README.md"
9
+ license = { text = "MIT" }
10
+ requires-python = ">=3.14,<4.0"
11
+ dependencies = [
12
+ "mypy (>=1.19.1,<2.0.0)",
13
+ "deptry (>=0.24.0,<0.25.0)",
14
+ "ruff (>=0.14.10,<0.15.0)",
15
+ "regex>=2023.10.3",
16
+ "numpy>=1.26",
17
+ "scikit-learn>=1.5",
18
+ "rank-bm25>=0.2.2"
19
+
20
+ ]
21
+
22
+
23
+ [build-system]
24
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
25
+ build-backend = "poetry.core.masonry.api"