ragbandit-core 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbandit_core-0.1.1/LICENSE.md +9 -0
- ragbandit_core-0.1.1/PKG-INFO +145 -0
- ragbandit_core-0.1.1/README.md +121 -0
- ragbandit_core-0.1.1/pyproject.toml +51 -0
- ragbandit_core-0.1.1/setup.cfg +4 -0
- ragbandit_core-0.1.1/src/ragbandit/__init__.py +26 -0
- ragbandit_core-0.1.1/src/ragbandit/config/__init__.py +3 -0
- ragbandit_core-0.1.1/src/ragbandit/config/llms.py +34 -0
- ragbandit_core-0.1.1/src/ragbandit/config/pricing.py +38 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/__init__.py +66 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/chunkers/__init__.py +18 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/chunkers/base_chunker.py +201 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/chunkers/fixed_size_chunker.py +174 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/chunkers/semantic_chunker.py +205 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/document_pipeline.py +350 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/embedders/__init__.py +14 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/embedders/base_embedder.py +82 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/embedders/mistral_embedder.py +129 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/ocr/__init__.py +13 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/ocr/base_ocr.py +136 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/ocr/mistral_ocr.py +147 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/processors/__init__.py +16 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/processors/base_processor.py +88 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/processors/footnotes_processor.py +353 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/processors/references_processor.py +408 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/utils/__init__.py +11 -0
- ragbandit_core-0.1.1/src/ragbandit/documents/utils/secure_file_handler.py +95 -0
- ragbandit_core-0.1.1/src/ragbandit/prompt_tools/__init__.py +27 -0
- ragbandit_core-0.1.1/src/ragbandit/prompt_tools/footnotes_processor_tools.py +195 -0
- ragbandit_core-0.1.1/src/ragbandit/prompt_tools/prompt_tool.py +118 -0
- ragbandit_core-0.1.1/src/ragbandit/prompt_tools/references_processor_tools.py +31 -0
- ragbandit_core-0.1.1/src/ragbandit/prompt_tools/semantic_chunker_tools.py +56 -0
- ragbandit_core-0.1.1/src/ragbandit/schema.py +206 -0
- ragbandit_core-0.1.1/src/ragbandit/utils/__init__.py +19 -0
- ragbandit_core-0.1.1/src/ragbandit/utils/in_memory_log_handler.py +33 -0
- ragbandit_core-0.1.1/src/ragbandit/utils/llm_utils.py +188 -0
- ragbandit_core-0.1.1/src/ragbandit/utils/mistral_client.py +76 -0
- ragbandit_core-0.1.1/src/ragbandit/utils/token_usage_tracker.py +220 -0
- ragbandit_core-0.1.1/src/ragbandit_core.egg-info/PKG-INFO +145 -0
- ragbandit_core-0.1.1/src/ragbandit_core.egg-info/SOURCES.txt +41 -0
- ragbandit_core-0.1.1/src/ragbandit_core.egg-info/dependency_links.txt +1 -0
- ragbandit_core-0.1.1/src/ragbandit_core.egg-info/requires.txt +5 -0
- ragbandit_core-0.1.1/src/ragbandit_core.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Martim Chaves
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ragbandit-core
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Core utilities for document processing, RAG configuration, querying, and evaluation.
|
|
5
|
+
Author-email: Martim Chaves <martim@ragbandit.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/MartimChaves/ragbandit-core
|
|
8
|
+
Project-URL: Documentation, https://github.com/MartimChaves/ragbandit-core#readme
|
|
9
|
+
Project-URL: Source, https://github.com/MartimChaves/ragbandit-core
|
|
10
|
+
Project-URL: Issues, https://github.com/MartimChaves/ragbandit-core/issues
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE.md
|
|
18
|
+
Requires-Dist: pydantic>=2.11.7
|
|
19
|
+
Requires-Dist: llama-index>=0.12.52
|
|
20
|
+
Requires-Dist: mistralai>=1.7.0
|
|
21
|
+
Requires-Dist: ragas>=0.3.0
|
|
22
|
+
Requires-Dist: cryptography>=44.0.2
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# ragbandit-core
|
|
26
|
+
|
|
27
|
+
Core utilities for:
|
|
28
|
+
|
|
29
|
+
* Document ingestion & processing (OCR, chunking, embedding)
|
|
30
|
+
* Building and running Retrieval-Augmented Generation (RAG) pipelines
|
|
31
|
+
* Evaluating answers with automated metrics
|
|
32
|
+
|
|
33
|
+
## Quick start
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install ragbandit-core
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from ragbandit.documents import (
|
|
41
|
+
DocumentPipeline,
|
|
42
|
+
ReferencesProcessor,
|
|
43
|
+
FootnoteProcessor,
|
|
44
|
+
MistralOCRDocument,
|
|
45
|
+
MistralEmbedder,
|
|
46
|
+
SemanticChunker
|
|
47
|
+
)
|
|
48
|
+
import os
|
|
49
|
+
import logging
|
|
50
|
+
from dotenv import load_dotenv
|
|
51
|
+
load_dotenv()
|
|
52
|
+
|
|
53
|
+
# Configure logging
|
|
54
|
+
logging.basicConfig(
|
|
55
|
+
level=logging.INFO,
|
|
56
|
+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
|
|
60
|
+
|
|
61
|
+
file_path = "./data/raw/[document_name].pdf"
|
|
62
|
+
|
|
63
|
+
doc_pipeline = DocumentPipeline(
|
|
64
|
+
chunker=SemanticChunker(min_chunk_size=500, api_key=MISTRAL_API_KEY),
|
|
65
|
+
embedder=MistralEmbedder(model="mistral-embed", api_key=MISTRAL_API_KEY), # noqa
|
|
66
|
+
ocr_processor=MistralOCRDocument(api_key=MISTRAL_API_KEY),
|
|
67
|
+
processors=[
|
|
68
|
+
ReferencesProcessor(api_key=MISTRAL_API_KEY),
|
|
69
|
+
FootnoteProcessor(api_key=MISTRAL_API_KEY),
|
|
70
|
+
],
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
extended_response = doc_pipeline.process(file_path)
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Running Steps Manually
|
|
78
|
+
|
|
79
|
+
For more control, you can run each pipeline step independently:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from ragbandit.documents import (
|
|
83
|
+
DocumentPipeline,
|
|
84
|
+
ReferencesProcessor,
|
|
85
|
+
MistralOCRDocument,
|
|
86
|
+
MistralEmbedder,
|
|
87
|
+
SemanticChunker
|
|
88
|
+
)
|
|
89
|
+
import os
|
|
90
|
+
from dotenv import load_dotenv
|
|
91
|
+
load_dotenv()
|
|
92
|
+
|
|
93
|
+
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
|
|
94
|
+
file_path = "./data/raw/[document_name].pdf"
|
|
95
|
+
|
|
96
|
+
# Create pipeline with only the components you need
|
|
97
|
+
pipeline = DocumentPipeline(
|
|
98
|
+
ocr_processor=MistralOCRDocument(api_key=MISTRAL_API_KEY),
|
|
99
|
+
processors=[ReferencesProcessor(api_key=MISTRAL_API_KEY)],
|
|
100
|
+
chunker=SemanticChunker(min_chunk_size=500, api_key=MISTRAL_API_KEY),
|
|
101
|
+
embedder=MistralEmbedder(model="mistral-embed", api_key=MISTRAL_API_KEY),
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
# Step 1: Run OCR
|
|
105
|
+
ocr_result = pipeline.run_ocr(file_path)
|
|
106
|
+
|
|
107
|
+
# Step 2: Run processors (optional)
|
|
108
|
+
processing_results = pipeline.run_processors(ocr_result)
|
|
109
|
+
final_doc = processing_results[-1] # Get the last processor's output
|
|
110
|
+
|
|
111
|
+
# Step 3: Chunk the document
|
|
112
|
+
chunk_result = pipeline.run_chunker(final_doc)
|
|
113
|
+
|
|
114
|
+
# Step 4: Embed chunks
|
|
115
|
+
embedding_result = pipeline.run_embedder(chunk_result)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
You can also create separate pipelines for different steps:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
# OCR-only pipeline
|
|
122
|
+
ocr_pipeline = DocumentPipeline(
|
|
123
|
+
ocr_processor=MistralOCRDocument(api_key=MISTRAL_API_KEY)
|
|
124
|
+
)
|
|
125
|
+
ocr_result = ocr_pipeline.run_ocr(file_path)
|
|
126
|
+
|
|
127
|
+
# Later, chunk with a different pipeline
|
|
128
|
+
chunk_pipeline = DocumentPipeline(
|
|
129
|
+
chunker=SemanticChunker(min_chunk_size=500, api_key=MISTRAL_API_KEY)
|
|
130
|
+
)
|
|
131
|
+
chunks = chunk_pipeline.run_chunker(ocr_result)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Package layout
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
ragbandit-core/
|
|
138
|
+
├── src/ragbandit/
|
|
139
|
+
│ ├── documents/ # document ingestion, OCR, chunking, embedding
|
|
140
|
+
└── tests/
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## License
|
|
144
|
+
|
|
145
|
+
MIT
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# ragbandit-core
|
|
2
|
+
|
|
3
|
+
Core utilities for:
|
|
4
|
+
|
|
5
|
+
* Document ingestion & processing (OCR, chunking, embedding)
|
|
6
|
+
* Building and running Retrieval-Augmented Generation (RAG) pipelines
|
|
7
|
+
* Evaluating answers with automated metrics
|
|
8
|
+
|
|
9
|
+
## Quick start
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install ragbandit-core
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from ragbandit.documents import (
|
|
17
|
+
DocumentPipeline,
|
|
18
|
+
ReferencesProcessor,
|
|
19
|
+
FootnoteProcessor,
|
|
20
|
+
MistralOCRDocument,
|
|
21
|
+
MistralEmbedder,
|
|
22
|
+
SemanticChunker
|
|
23
|
+
)
|
|
24
|
+
import os
|
|
25
|
+
import logging
|
|
26
|
+
from dotenv import load_dotenv
|
|
27
|
+
load_dotenv()
|
|
28
|
+
|
|
29
|
+
# Configure logging
|
|
30
|
+
logging.basicConfig(
|
|
31
|
+
level=logging.INFO,
|
|
32
|
+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
|
|
36
|
+
|
|
37
|
+
file_path = "./data/raw/[document_name].pdf"
|
|
38
|
+
|
|
39
|
+
doc_pipeline = DocumentPipeline(
|
|
40
|
+
chunker=SemanticChunker(min_chunk_size=500, api_key=MISTRAL_API_KEY),
|
|
41
|
+
embedder=MistralEmbedder(model="mistral-embed", api_key=MISTRAL_API_KEY), # noqa
|
|
42
|
+
ocr_processor=MistralOCRDocument(api_key=MISTRAL_API_KEY),
|
|
43
|
+
processors=[
|
|
44
|
+
ReferencesProcessor(api_key=MISTRAL_API_KEY),
|
|
45
|
+
FootnoteProcessor(api_key=MISTRAL_API_KEY),
|
|
46
|
+
],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
extended_response = doc_pipeline.process(file_path)
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Running Steps Manually
|
|
54
|
+
|
|
55
|
+
For more control, you can run each pipeline step independently:
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from ragbandit.documents import (
|
|
59
|
+
DocumentPipeline,
|
|
60
|
+
ReferencesProcessor,
|
|
61
|
+
MistralOCRDocument,
|
|
62
|
+
MistralEmbedder,
|
|
63
|
+
SemanticChunker
|
|
64
|
+
)
|
|
65
|
+
import os
|
|
66
|
+
from dotenv import load_dotenv
|
|
67
|
+
load_dotenv()
|
|
68
|
+
|
|
69
|
+
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
|
|
70
|
+
file_path = "./data/raw/[document_name].pdf"
|
|
71
|
+
|
|
72
|
+
# Create pipeline with only the components you need
|
|
73
|
+
pipeline = DocumentPipeline(
|
|
74
|
+
ocr_processor=MistralOCRDocument(api_key=MISTRAL_API_KEY),
|
|
75
|
+
processors=[ReferencesProcessor(api_key=MISTRAL_API_KEY)],
|
|
76
|
+
chunker=SemanticChunker(min_chunk_size=500, api_key=MISTRAL_API_KEY),
|
|
77
|
+
embedder=MistralEmbedder(model="mistral-embed", api_key=MISTRAL_API_KEY),
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# Step 1: Run OCR
|
|
81
|
+
ocr_result = pipeline.run_ocr(file_path)
|
|
82
|
+
|
|
83
|
+
# Step 2: Run processors (optional)
|
|
84
|
+
processing_results = pipeline.run_processors(ocr_result)
|
|
85
|
+
final_doc = processing_results[-1] # Get the last processor's output
|
|
86
|
+
|
|
87
|
+
# Step 3: Chunk the document
|
|
88
|
+
chunk_result = pipeline.run_chunker(final_doc)
|
|
89
|
+
|
|
90
|
+
# Step 4: Embed chunks
|
|
91
|
+
embedding_result = pipeline.run_embedder(chunk_result)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
You can also create separate pipelines for different steps:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
# OCR-only pipeline
|
|
98
|
+
ocr_pipeline = DocumentPipeline(
|
|
99
|
+
ocr_processor=MistralOCRDocument(api_key=MISTRAL_API_KEY)
|
|
100
|
+
)
|
|
101
|
+
ocr_result = ocr_pipeline.run_ocr(file_path)
|
|
102
|
+
|
|
103
|
+
# Later, chunk with a different pipeline
|
|
104
|
+
chunk_pipeline = DocumentPipeline(
|
|
105
|
+
chunker=SemanticChunker(min_chunk_size=500, api_key=MISTRAL_API_KEY)
|
|
106
|
+
)
|
|
107
|
+
chunks = chunk_pipeline.run_chunker(ocr_result)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Package layout
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
ragbandit-core/
|
|
114
|
+
├── src/ragbandit/
|
|
115
|
+
│ ├── documents/ # document ingestion, OCR, chunking, embedding
|
|
116
|
+
└── tests/
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## License
|
|
120
|
+
|
|
121
|
+
MIT
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ragbandit-core"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Core utilities for document processing, RAG configuration, querying, and evaluation."
|
|
9
|
+
authors = [
|
|
10
|
+
{ name="Martim Chaves", email="martim@ragbandit.com" }
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
license = {text = "MIT"}
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
]
|
|
21
|
+
# Core runtime dependencies
|
|
22
|
+
dependencies = [
|
|
23
|
+
"pydantic>=2.11.7",
|
|
24
|
+
"llama-index>=0.12.52",
|
|
25
|
+
"mistralai>=1.7.0",
|
|
26
|
+
"ragas>=0.3.0",
|
|
27
|
+
"cryptography>=44.0.2",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[tool.uv]
|
|
31
|
+
dev-dependencies = [
|
|
32
|
+
"pytest<8.0.0,>=7.4.3",
|
|
33
|
+
"mypy<2.0.0,>=1.8.0",
|
|
34
|
+
"ruff<1.0.0,>=0.2.2",
|
|
35
|
+
"pre-commit<4.0.0,>=3.6.2",
|
|
36
|
+
"types-passlib<2.0.0.0,>=1.7.7.20240106",
|
|
37
|
+
"coverage<8.0.0,>=7.4.3",
|
|
38
|
+
"black>=25.1.0"
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[tool.setuptools]
|
|
42
|
+
package-dir = {"" = "src"}
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.packages.find]
|
|
45
|
+
where = ["src"]
|
|
46
|
+
|
|
47
|
+
[project.urls]
|
|
48
|
+
Homepage = "https://github.com/MartimChaves/ragbandit-core"
|
|
49
|
+
Documentation = "https://github.com/MartimChaves/ragbandit-core#readme"
|
|
50
|
+
Source = "https://github.com/MartimChaves/ragbandit-core"
|
|
51
|
+
Issues = "https://github.com/MartimChaves/ragbandit-core/issues"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""ragbandit core package.
|
|
2
|
+
|
|
3
|
+
This package contains sub-modules for document processing,
|
|
4
|
+
RAG pipeline configuration/execution, and evaluation utilities.
|
|
5
|
+
Only lightweight interfaces and shared utilities are defined here;
|
|
6
|
+
heavy logic resides in sub-packages.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from importlib import metadata as _metadata
|
|
10
|
+
|
|
11
|
+
__version__: str
|
|
12
|
+
try:
|
|
13
|
+
__version__ = _metadata.version("ragbandit-core")
|
|
14
|
+
except _metadata.PackageNotFoundError: # pragma: no cover
|
|
15
|
+
__version__ = "0.0.0+dev"
|
|
16
|
+
|
|
17
|
+
# Re-export public interfaces so that users can simply:
|
|
18
|
+
# from ragbandit import DocumentProcessor, RAGConfig, RAGPipeline, evaluate  # not re-exported yet; for now use:
|
|
19
|
+
|
|
20
|
+
# from ragbandit.documents import DocumentPipeline
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"__version__",
|
|
25
|
+
# "DocumentPipeline",
|
|
26
|
+
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM configuration settings for ragbandit.
|
|
3
|
+
|
|
4
|
+
This module defines default settings and constants for LLM interactions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Default model settings
|
|
8
|
+
DEFAULT_MODEL = "mistral-small-latest"
|
|
9
|
+
DEFAULT_TEMPERATURE = 0.0
|
|
10
|
+
|
|
11
|
+
# Retry settings
|
|
12
|
+
DEFAULT_MAX_RETRIES = 3
|
|
13
|
+
DEFAULT_RETRY_DELAY = 1.0 # seconds
|
|
14
|
+
DEFAULT_BACKOFF_FACTOR = 2.0 # exponential backoff factor
|
|
15
|
+
DEFAULT_TIMEOUT = 30.0 # seconds
|
|
16
|
+
|
|
17
|
+
# Token limits
|
|
18
|
+
MAX_PROMPT_TOKENS = {
|
|
19
|
+
"mistral-small-latest": 8000,
|
|
20
|
+
"mistral-medium-latest": 32000,
|
|
21
|
+
"mistral-large-latest": 32000,
|
|
22
|
+
"gpt-3.5-turbo": 4096,
|
|
23
|
+
"gpt-4": 8192,
|
|
24
|
+
"gpt-4-turbo": 128000,
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
# System prompts
|
|
28
|
+
DEFAULT_SYSTEM_PROMPT = """You are a helpful AI assistant."""
|
|
29
|
+
|
|
30
|
+
# Response formats
|
|
31
|
+
JSON_FORMAT_INSTRUCTION = """
|
|
32
|
+
Your response must be valid JSON that matches the following schema:
|
|
33
|
+
{schema}
|
|
34
|
+
"""
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pricing configuration for LLM API calls.
|
|
3
|
+
|
|
4
|
+
This module contains pricing constants for various
|
|
5
|
+
LLM models and embedding models.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Token cost rates per 1M tokens (in USD)
|
|
9
|
+
# Based on Mistral AI pricing as of July 2025
|
|
10
|
+
MODEL_COSTS = {
|
|
11
|
+
# Format: "model_name": (input_cost_per_1M, output_cost_per_1M)
|
|
12
|
+
"mistral-small-latest": (2.00, 6.00),
|
|
13
|
+
"mistral-medium-latest": (6.00, 18.00),
|
|
14
|
+
"mistral-large-latest": (12.00, 36.00),
|
|
15
|
+
# Add other models as needed
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
# Embedding model costs per 1M tokens
|
|
19
|
+
EMBEDDING_COSTS = {
|
|
20
|
+
# Format: "model_name": cost_per_1M_tokens
|
|
21
|
+
"mistral-embed": 0.10,
|
|
22
|
+
"text-embedding-3-small": 0.02,
|
|
23
|
+
"text-embedding-3-large": 0.13,
|
|
24
|
+
# Add other embedding models as needed
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
# OCR model costs per page (in EUR; note that the token costs above are in USD, so convert before summing)
|
|
28
|
+
OCR_MODEL_COSTS = {
|
|
29
|
+
# Format: "model_name": cost_per_page
|
|
30
|
+
"mistral-ocr-latest": 0.001, # 1 EUR per 1000 pages
|
|
31
|
+
# Add other OCR models as needed
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
# Default OCR model to use if the specified model is not in OCR_MODEL_COSTS
|
|
35
|
+
DEFAULT_OCR_MODEL = "mistral-ocr-latest"
|
|
36
|
+
|
|
37
|
+
# Default model to use if the specified model is not in MODEL_COSTS
|
|
38
|
+
DEFAULT_MODEL = "mistral-small-latest"
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document processing module for handling, analyzing, and transforming documents.
|
|
3
|
+
|
|
4
|
+
This package provides tools for OCR, chunking,
|
|
5
|
+
embedding, and processing documents.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Import key components from subdirectories
|
|
9
|
+
from ragbandit.documents.document_pipeline import DocumentPipeline
|
|
10
|
+
|
|
11
|
+
# Import from chunkers
|
|
12
|
+
from ragbandit.documents.chunkers import (
|
|
13
|
+
BaseChunker,
|
|
14
|
+
FixedSizeChunker,
|
|
15
|
+
SemanticChunker,
|
|
16
|
+
SemanticBreak
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Import from processors
|
|
20
|
+
from ragbandit.documents.processors import (
|
|
21
|
+
BaseProcessor,
|
|
22
|
+
FootnoteProcessor,
|
|
23
|
+
ReferencesProcessor
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
# Import from embedders
|
|
27
|
+
from ragbandit.documents.embedders import (
|
|
28
|
+
BaseEmbedder,
|
|
29
|
+
MistralEmbedder
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Import from OCR
|
|
33
|
+
from ragbandit.documents.ocr import (
|
|
34
|
+
BaseOCR,
|
|
35
|
+
MistralOCRDocument
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Import from utils
|
|
39
|
+
from ragbandit.documents.utils import SecureFileHandler
|
|
40
|
+
|
|
41
|
+
__all__ = [
|
|
42
|
+
# Main pipeline
|
|
43
|
+
"DocumentPipeline",
|
|
44
|
+
|
|
45
|
+
# Chunkers
|
|
46
|
+
"BaseChunker",
|
|
47
|
+
"FixedSizeChunker",
|
|
48
|
+
"SemanticChunker",
|
|
49
|
+
"SemanticBreak",
|
|
50
|
+
|
|
51
|
+
# Processors
|
|
52
|
+
"BaseProcessor",
|
|
53
|
+
"FootnoteProcessor",
|
|
54
|
+
"ReferencesProcessor",
|
|
55
|
+
|
|
56
|
+
# Embedders
|
|
57
|
+
"BaseEmbedder",
|
|
58
|
+
"MistralEmbedder",
|
|
59
|
+
|
|
60
|
+
# OCR
|
|
61
|
+
"BaseOCR",
|
|
62
|
+
"MistralOCRDocument",
|
|
63
|
+
|
|
64
|
+
# Utils
|
|
65
|
+
"SecureFileHandler"
|
|
66
|
+
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chunker implementations for document processing.
|
|
3
|
+
|
|
4
|
+
This module provides various chunking strategies for documents.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from ragbandit.documents.chunkers.base_chunker import BaseChunker
|
|
8
|
+
from ragbandit.documents.chunkers.fixed_size_chunker import FixedSizeChunker
|
|
9
|
+
from ragbandit.documents.chunkers.semantic_chunker import (
|
|
10
|
+
SemanticChunker, SemanticBreak
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"BaseChunker",
|
|
15
|
+
"FixedSizeChunker",
|
|
16
|
+
"SemanticChunker",
|
|
17
|
+
"SemanticBreak"
|
|
18
|
+
]
|