wizit-open-rag 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wizit_open_rag-0.0.1/PKG-INFO +250 -0
- wizit_open_rag-0.0.1/README.md +211 -0
- wizit_open_rag-0.0.1/pyproject.toml +70 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/__init__.py +13 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/application/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/application/context_chunk_app.py +150 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/application/interfaces.py +178 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/application/kdb_service.py +73 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/application/transcription_app.py +144 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/chunks.py +135 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/data/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/data/kdb.py +17 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/data/prompts.py +202 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/data/storage.py +10 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/domain/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/domain/models.py +31 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/domain/services.py +97 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/embeddings/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/embeddings/aws_embeddings.py +41 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/llms/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/llms/aws_model.py +78 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/persistence/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/persistence/pg_connection_manager.py +67 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/persistence/pg_engine_manager.py +34 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/persistence/weaviate_connection_manager.py +79 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/markdown_chunks.py +73 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/pg_embeddings.py +263 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/recursive_chunks.py +60 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/semantic_chunks.py +72 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/weaviate_embeddings.py +179 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/secrets/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/infra/secrets/aws_secrets_manager.py +32 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/transcription.py +131 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/utils/file_utils.py +12 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/__init__.py +0 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/context_nodes.py +81 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/context_state.py +10 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/context_tools.py +58 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/context_workflow.py +40 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_nodes.py +141 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_schemas.py +25 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_state.py +17 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_tools.py +56 -0
- wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_workflow.py +46 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: wizit-open-rag
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: AI-powered document transcription and semantic chunking for RAG pipelines
|
|
5
|
+
Keywords: rag,retrieval-augmented-generation,llm,chunking,transcription,weaviate,bedrock,langchain,langgraph,pdf,semantic-search
|
|
6
|
+
Author: Restebance
|
|
7
|
+
Author-email: Restebance <restebance@gmail.com>
|
|
8
|
+
License: Apache-2.0
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Classifier: Typing :: Typed
|
|
18
|
+
Requires-Dist: boto3>=1.40.23
|
|
19
|
+
Requires-Dist: langchain>=1.2.10
|
|
20
|
+
Requires-Dist: langchain-aws>=1.3.0
|
|
21
|
+
Requires-Dist: langchain-classic>=1.0.7
|
|
22
|
+
Requires-Dist: langchain-community>=0.4.1
|
|
23
|
+
Requires-Dist: langchain-core>=1.2.16
|
|
24
|
+
Requires-Dist: langchain-experimental>=0.4.1
|
|
25
|
+
Requires-Dist: langchain-text-splitters>=1.1.1
|
|
26
|
+
Requires-Dist: langgraph>=1.0.9
|
|
27
|
+
Requires-Dist: pillow>=11.3.0
|
|
28
|
+
Requires-Dist: pymupdf>=1.27.1
|
|
29
|
+
Requires-Dist: anthropic>=0.84.0
|
|
30
|
+
Requires-Dist: psycopg2-binary>=2.9.11
|
|
31
|
+
Requires-Dist: sqlalchemy[asyncio]>=2.0.43
|
|
32
|
+
Requires-Dist: langchain-postgres>=0.0.17
|
|
33
|
+
Requires-Dist: weaviate-client>=4.0.0
|
|
34
|
+
Requires-Dist: langchain-weaviate>=0.0.3
|
|
35
|
+
Requires-Python: >=3.12
|
|
36
|
+
Project-URL: Bug Tracker, https://github.com/Restebance/open_rag/issues
|
|
37
|
+
Project-URL: Repository, https://github.com/Restebance/open_rag
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# open_rag
|
|
41
|
+
|
|
42
|
+
A Python library for AI-powered document transcription and semantic chunking with RAG (Retrieval-Augmented Generation). It processes PDFs through LLMs (Claude via **AWS Bedrock**), chunks the resulting Markdown semantically, enriches each chunk with surrounding context, and returns ready-to-index `Document` objects for PostgreSQL pgvector.
|
|
43
|
+
|
|
44
|
+
**Version**: 0.0.1 | **Python**: >=3.12 | **Build**: uv
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
- PDF-to-Markdown transcription powered by Claude via AWS Bedrock
|
|
51
|
+
- LangGraph-based transcription workflow with configurable retry logic and accuracy thresholds
|
|
52
|
+
- Semantic chunking with 85th-percentile breakpoints (plus recursive and Markdown-header strategies)
|
|
53
|
+
- Per-chunk context enrichment via a dedicated LangGraph workflow — each chunk is wrapped with `<context>` and `<content>` tags
|
|
54
|
+
- Pluggable storage backends: local filesystem or AWS S3
|
|
55
|
+
- Vector indexing into PostgreSQL pgvector via LangChain `PGVectorStore`
|
|
56
|
+
- LangSmith tracing support
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Prerequisites
|
|
61
|
+
|
|
62
|
+
- Python 3.12 or higher
|
|
63
|
+
- [uv](https://docs.astral.sh/uv/) for dependency management
|
|
64
|
+
- AWS credentials configured (standard boto3 credential chain — env vars, `~/.aws/credentials`, or instance profile)
|
|
65
|
+
- PostgreSQL database with the [pgvector](https://github.com/pgvector/pgvector) extension enabled
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Installation
|
|
70
|
+
|
|
71
|
+
Install from PyPI:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install wizit-open-rag
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
For development (clone + install with dev tools):
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
git clone https://github.com/Restebance/open_rag.git
|
|
81
|
+
cd open_rag
|
|
82
|
+
uv sync --group dev
|
|
83
|
+
cp example.env .env
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Fill in `.env` with your credentials (see [Environment Variables](#environment-variables) below).
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Usage
|
|
91
|
+
|
|
92
|
+
### Document Transcription
|
|
93
|
+
|
|
94
|
+
`OpenRagTranscriber` accepts the raw bytes of a single PDF page and returns a `ParsedDocPage` containing the Markdown transcription.
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
import asyncio
|
|
98
|
+
from wizit_open_rag import OpenRagTranscriber
|
|
99
|
+
|
|
100
|
+
transcriber = OpenRagTranscriber(
|
|
101
|
+
langsmith_project_name="my-project", # required
|
|
102
|
+
langsmith_api_key="lsv2_...", # required
|
|
103
|
+
llm_model_id="global.anthropic.claude-sonnet-4-6",
|
|
104
|
+
target_language="es-CO",
|
|
105
|
+
transcription_accuracy_threshold=0.90,
|
|
106
|
+
max_transcription_retries=2,
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
with open("page.pdf", "rb") as f:
|
|
110
|
+
page_bytes = f.read()
|
|
111
|
+
|
|
112
|
+
result = asyncio.run(transcriber.transcribe_document(page_bytes))
|
|
113
|
+
print(result.page_text) # Markdown string
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Semantic Chunking with Context
|
|
117
|
+
|
|
118
|
+
`ChunksManager` takes a pre-loaded Markdown string and returns a list of LangChain `Document` objects, each enriched with a contextual summary.
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
import asyncio
|
|
122
|
+
from wizit_open_rag import ChunksManager
|
|
123
|
+
|
|
124
|
+
manager = ChunksManager(
|
|
125
|
+
langsmith_project_name="my-project", # required
|
|
126
|
+
langsmith_api_key="lsv2_...", # required
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
with open("document.md") as f:
|
|
130
|
+
markdown_content = f.read()
|
|
131
|
+
|
|
132
|
+
docs = asyncio.run(manager.gen_context_chunks(
|
|
133
|
+
file_key="document.md",
|
|
134
|
+
file_markdown_content=markdown_content,
|
|
135
|
+
file_tags={"category": "hr", "department": "onboarding"},
|
|
136
|
+
))
|
|
137
|
+
|
|
138
|
+
# docs is a List[Document]; index to pgvector as needed
|
|
139
|
+
for doc in docs:
|
|
140
|
+
print(doc.page_content)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
> **Note:** `gen_context_chunks` does not load files from storage — the caller must pass the content as a string. Indexing to pgvector is the caller's responsibility.
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Architecture
|
|
148
|
+
|
|
149
|
+
The codebase follows a clean layered architecture. Dependency direction: `transcription.py / chunks.py → application → domain ↔ infra ← workflows`.
|
|
150
|
+
|
|
151
|
+
```
|
|
152
|
+
wizit_open_rag/ # installable package (src/wizit_open_rag/)
|
|
153
|
+
├── transcription.py # Public API — OpenRagTranscriber
|
|
154
|
+
├── chunks.py # Public API — ChunksManager
|
|
155
|
+
├── domain/ # Core data models (PageToTranscribe, ParsedDocPage, ParsedDoc)
|
|
156
|
+
├── application/ # Orchestration + abstract interfaces (ABCs)
|
|
157
|
+
├── data/ # Shared enums and prompt strings
|
|
158
|
+
├── infra/
|
|
159
|
+
│ ├── llms/ # AWS Bedrock chat (ChatBedrockConverse)
|
|
160
|
+
│ ├── embeddings/ # AWS Bedrock embeddings (BedrockEmbeddings)
|
|
161
|
+
│ ├── persistence/ # Local filesystem, AWS S3, PostgreSQL managers
|
|
162
|
+
│ ├── rag/ # SemanticChunks, RecursiveChunks, MarkdownHeadersChunks, PGVectorStore, WeaviateEmbeddingsManager
|
|
163
|
+
│ └── secrets/ # AWS Secrets Manager helper
|
|
164
|
+
├── utils/ # validate_file_name_format
|
|
165
|
+
└── workflows/ # LangGraph state machines (transcription + context)
|
|
166
|
+
tests/ # pytest suite
|
|
167
|
+
data/ # Sample / test documents
|
|
168
|
+
example.env
|
|
169
|
+
pyproject.toml
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Key Data Flow
|
|
173
|
+
|
|
174
|
+
```
|
|
175
|
+
PDF bytes
|
|
176
|
+
→ ParseDocModelService (PyMuPDF → base64 pages)
|
|
177
|
+
→ TranscriptionWorkflow (LangGraph → Claude via AWS Bedrock → Markdown)
|
|
178
|
+
|
|
179
|
+
Markdown string + tags
|
|
180
|
+
→ SemanticChunks (AWS Bedrock embeddings, 85th-percentile breakpoints)
|
|
181
|
+
→ ContextWorkflow (LangGraph → Claude adds surrounding context per chunk)
|
|
182
|
+
→ List[Document] (each chunk wrapped in <context> / <content> tags)
|
|
183
|
+
→ Caller indexes to pgvector
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## Environment Variables
|
|
189
|
+
|
|
190
|
+
Copy `example.env` to `.env` and fill in the values:
|
|
191
|
+
|
|
192
|
+
| Variable | Purpose |
|
|
193
|
+
|---|---|
|
|
194
|
+
| `VECTOR_STORE_CONNECTION` | PostgreSQL connection string (pgvector) |
|
|
195
|
+
| `VECTOR_STORE_TABLE` | pgvector table name |
|
|
196
|
+
| `LANGSMITH_API_KEY` | LangSmith API key for tracing |
|
|
197
|
+
| `LANGCHAIN_PROJECT` | LangSmith project name |
|
|
198
|
+
| `LANGSMITH_TRACING` | Enable LangSmith tracing (`true` / `false`) |
|
|
199
|
+
| `SUPABASE_KEY` / `SUPABASE_URL` | Supabase credentials (optional) |
|
|
200
|
+
|
|
201
|
+
AWS credentials are read from the standard **boto3 credential chain** and are not set in `.env`.
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Development
|
|
206
|
+
|
|
207
|
+
### Running tests
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
# Unit tests (mocked — no AWS credentials required)
|
|
211
|
+
uv run pytest
|
|
212
|
+
|
|
213
|
+
# Transcription integration test (requires live AWS credentials)
|
|
214
|
+
uv run python src/wizit_open_rag/transcription.py
|
|
215
|
+
|
|
216
|
+
# Chunking integration test (requires live AWS credentials)
|
|
217
|
+
uv run python src/wizit_open_rag/chunks.py
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### Profiling
|
|
221
|
+
|
|
222
|
+
```bash
|
|
223
|
+
# CPU profiling
|
|
224
|
+
uv run pyinstrument test.py transcribe <file.pdf> <source_dir> <target_dir>
|
|
225
|
+
|
|
226
|
+
# Memory profiling
|
|
227
|
+
uv run python -m memray run test.py transcribe <file.pdf> <source_dir> <target_dir>
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Building the package
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
uv build
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Gotchas
|
|
239
|
+
|
|
240
|
+
- `SemanticChunks` calls AWS Bedrock **at construction time** (via `SemanticChunker`) — not just at index time. Make sure credentials are available before instantiating `ChunksManager`.
|
|
241
|
+
- Both `transcribe_document` and `gen_context_chunks` are `async`; wrap them in `asyncio.run(...)` from synchronous code.
|
|
242
|
+
- `OpenRagTranscriber` and `ChunksManager` require `langsmith_project_name` and `langsmith_api_key` as **constructor arguments** — they are not read from environment variables.
|
|
243
|
+
- `ParseDocModelService.parse_document_to_base64_pages` iterates `range(0, page_count)` — pages are zero-indexed (`page_number=0` is the first page).
|
|
244
|
+
- AWS Bedrock cross-region model IDs use the `global.` prefix (e.g. `global.anthropic.claude-sonnet-4-6`).
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## License
|
|
249
|
+
|
|
250
|
+
Licensed under the [Apache License 2.0](LICENSE.md).
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# open_rag
|
|
2
|
+
|
|
3
|
+
A Python library for AI-powered document transcription and semantic chunking with RAG (Retrieval-Augmented Generation). It processes PDFs through LLMs (Claude via **AWS Bedrock**), chunks the resulting Markdown semantically, enriches each chunk with surrounding context, and returns ready-to-index `Document` objects for PostgreSQL pgvector.
|
|
4
|
+
|
|
5
|
+
**Version**: 0.0.1 | **Python**: >=3.12 | **Build**: uv
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- PDF-to-Markdown transcription powered by Claude via AWS Bedrock
|
|
12
|
+
- LangGraph-based transcription workflow with configurable retry logic and accuracy thresholds
|
|
13
|
+
- Semantic chunking with 85th-percentile breakpoints (plus recursive and Markdown-header strategies)
|
|
14
|
+
- Per-chunk context enrichment via a dedicated LangGraph workflow — each chunk is wrapped with `<context>` and `<content>` tags
|
|
15
|
+
- Pluggable storage backends: local filesystem or AWS S3
|
|
16
|
+
- Vector indexing into PostgreSQL pgvector via LangChain `PGVectorStore`
|
|
17
|
+
- LangSmith tracing support
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Prerequisites
|
|
22
|
+
|
|
23
|
+
- Python 3.12 or higher
|
|
24
|
+
- [uv](https://docs.astral.sh/uv/) for dependency management
|
|
25
|
+
- AWS credentials configured (standard boto3 credential chain — env vars, `~/.aws/credentials`, or instance profile)
|
|
26
|
+
- PostgreSQL database with the [pgvector](https://github.com/pgvector/pgvector) extension enabled
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
Install from PyPI:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install wizit-open-rag
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
For development (clone + install with dev tools):
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
git clone https://github.com/Restebance/open_rag.git
|
|
42
|
+
cd open_rag
|
|
43
|
+
uv sync --group dev
|
|
44
|
+
cp example.env .env
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Fill in `.env` with your credentials (see [Environment Variables](#environment-variables) below).
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Usage
|
|
52
|
+
|
|
53
|
+
### Document Transcription
|
|
54
|
+
|
|
55
|
+
`OpenRagTranscriber` accepts the raw bytes of a single PDF page and returns a `ParsedDocPage` containing the Markdown transcription.
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
import asyncio
|
|
59
|
+
from wizit_open_rag import OpenRagTranscriber
|
|
60
|
+
|
|
61
|
+
transcriber = OpenRagTranscriber(
|
|
62
|
+
langsmith_project_name="my-project", # required
|
|
63
|
+
langsmith_api_key="lsv2_...", # required
|
|
64
|
+
llm_model_id="global.anthropic.claude-sonnet-4-6",
|
|
65
|
+
target_language="es-CO",
|
|
66
|
+
transcription_accuracy_threshold=0.90,
|
|
67
|
+
max_transcription_retries=2,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
with open("page.pdf", "rb") as f:
|
|
71
|
+
page_bytes = f.read()
|
|
72
|
+
|
|
73
|
+
result = asyncio.run(transcriber.transcribe_document(page_bytes))
|
|
74
|
+
print(result.page_text) # Markdown string
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Semantic Chunking with Context
|
|
78
|
+
|
|
79
|
+
`ChunksManager` takes a pre-loaded Markdown string and returns a list of LangChain `Document` objects, each enriched with a contextual summary.
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
import asyncio
|
|
83
|
+
from wizit_open_rag import ChunksManager
|
|
84
|
+
|
|
85
|
+
manager = ChunksManager(
|
|
86
|
+
langsmith_project_name="my-project", # required
|
|
87
|
+
langsmith_api_key="lsv2_...", # required
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
with open("document.md") as f:
|
|
91
|
+
markdown_content = f.read()
|
|
92
|
+
|
|
93
|
+
docs = asyncio.run(manager.gen_context_chunks(
|
|
94
|
+
file_key="document.md",
|
|
95
|
+
file_markdown_content=markdown_content,
|
|
96
|
+
file_tags={"category": "hr", "department": "onboarding"},
|
|
97
|
+
))
|
|
98
|
+
|
|
99
|
+
# docs is a List[Document]; index to pgvector as needed
|
|
100
|
+
for doc in docs:
|
|
101
|
+
print(doc.page_content)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
> **Note:** `gen_context_chunks` does not load files from storage — the caller must pass the content as a string. Indexing to pgvector is the caller's responsibility.
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Architecture
|
|
109
|
+
|
|
110
|
+
The codebase follows a clean layered architecture. Dependency direction: `transcription.py / chunks.py → application → domain ↔ infra ← workflows`.
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
wizit_open_rag/ # installable package (src/wizit_open_rag/)
|
|
114
|
+
├── transcription.py # Public API — OpenRagTranscriber
|
|
115
|
+
├── chunks.py # Public API — ChunksManager
|
|
116
|
+
├── domain/ # Core data models (PageToTranscribe, ParsedDocPage, ParsedDoc)
|
|
117
|
+
├── application/ # Orchestration + abstract interfaces (ABCs)
|
|
118
|
+
├── data/ # Shared enums and prompt strings
|
|
119
|
+
├── infra/
|
|
120
|
+
│ ├── llms/ # AWS Bedrock chat (ChatBedrockConverse)
|
|
121
|
+
│ ├── embeddings/ # AWS Bedrock embeddings (BedrockEmbeddings)
|
|
122
|
+
│ ├── persistence/ # Local filesystem, AWS S3, PostgreSQL managers
|
|
123
|
+
│ ├── rag/ # SemanticChunks, RecursiveChunks, MarkdownHeadersChunks, PGVectorStore, WeaviateEmbeddingsManager
|
|
124
|
+
│ └── secrets/ # AWS Secrets Manager helper
|
|
125
|
+
├── utils/ # validate_file_name_format
|
|
126
|
+
└── workflows/ # LangGraph state machines (transcription + context)
|
|
127
|
+
tests/ # pytest suite
|
|
128
|
+
data/ # Sample / test documents
|
|
129
|
+
example.env
|
|
130
|
+
pyproject.toml
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Key Data Flow
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
PDF bytes
|
|
137
|
+
→ ParseDocModelService (PyMuPDF → base64 pages)
|
|
138
|
+
→ TranscriptionWorkflow (LangGraph → Claude via AWS Bedrock → Markdown)
|
|
139
|
+
|
|
140
|
+
Markdown string + tags
|
|
141
|
+
→ SemanticChunks (AWS Bedrock embeddings, 85th-percentile breakpoints)
|
|
142
|
+
→ ContextWorkflow (LangGraph → Claude adds surrounding context per chunk)
|
|
143
|
+
→ List[Document] (each chunk wrapped in <context> / <content> tags)
|
|
144
|
+
→ Caller indexes to pgvector
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Environment Variables
|
|
150
|
+
|
|
151
|
+
Copy `example.env` to `.env` and fill in the values:
|
|
152
|
+
|
|
153
|
+
| Variable | Purpose |
|
|
154
|
+
|---|---|
|
|
155
|
+
| `VECTOR_STORE_CONNECTION` | PostgreSQL connection string (pgvector) |
|
|
156
|
+
| `VECTOR_STORE_TABLE` | pgvector table name |
|
|
157
|
+
| `LANGSMITH_API_KEY` | LangSmith API key for tracing |
|
|
158
|
+
| `LANGCHAIN_PROJECT` | LangSmith project name |
|
|
159
|
+
| `LANGSMITH_TRACING` | Enable LangSmith tracing (`true` / `false`) |
|
|
160
|
+
| `SUPABASE_KEY` / `SUPABASE_URL` | Supabase credentials (optional) |
|
|
161
|
+
|
|
162
|
+
AWS credentials are read from the standard **boto3 credential chain** and are not set in `.env`.
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## Development
|
|
167
|
+
|
|
168
|
+
### Running tests
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
# Unit tests (mocked — no AWS credentials required)
|
|
172
|
+
uv run pytest
|
|
173
|
+
|
|
174
|
+
# Transcription integration test (requires live AWS credentials)
|
|
175
|
+
uv run python src/wizit_open_rag/transcription.py
|
|
176
|
+
|
|
177
|
+
# Chunking integration test (requires live AWS credentials)
|
|
178
|
+
uv run python src/wizit_open_rag/chunks.py
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Profiling
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
# CPU profiling
|
|
185
|
+
uv run pyinstrument test.py transcribe <file.pdf> <source_dir> <target_dir>
|
|
186
|
+
|
|
187
|
+
# Memory profiling
|
|
188
|
+
uv run python -m memray run test.py transcribe <file.pdf> <source_dir> <target_dir>
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Building the package
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
uv build
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Gotchas
|
|
200
|
+
|
|
201
|
+
- `SemanticChunks` calls AWS Bedrock **at construction time** (via `SemanticChunker`) — not just at index time. Make sure credentials are available before instantiating `ChunksManager`.
|
|
202
|
+
- Both `transcribe_document` and `gen_context_chunks` are `async`; wrap them in `asyncio.run(...)` from synchronous code.
|
|
203
|
+
- `OpenRagTranscriber` and `ChunksManager` require `langsmith_project_name` and `langsmith_api_key` as **constructor arguments** — they are not read from environment variables.
|
|
204
|
+
- `ParseDocModelService.parse_document_to_base64_pages` iterates `range(0, page_count)` — pages are zero-indexed (`page_number=0` is the first page).
|
|
205
|
+
- AWS Bedrock cross-region model IDs use the `global.` prefix (e.g. `global.anthropic.claude-sonnet-4-6`).
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## License
|
|
210
|
+
|
|
211
|
+
Licensed under the [Apache License 2.0](LICENSE.md).
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "wizit_open_rag"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "AI-powered document transcription and semantic chunking for RAG pipelines"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
license = { text = "Apache-2.0" }
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Restebance", email = "restebance@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
keywords = [
|
|
12
|
+
"rag",
|
|
13
|
+
"retrieval-augmented-generation",
|
|
14
|
+
"llm",
|
|
15
|
+
"chunking",
|
|
16
|
+
"transcription",
|
|
17
|
+
"weaviate",
|
|
18
|
+
"bedrock",
|
|
19
|
+
"langchain",
|
|
20
|
+
"langgraph",
|
|
21
|
+
"pdf",
|
|
22
|
+
"semantic-search",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 3 - Alpha",
|
|
26
|
+
"Intended Audience :: Developers",
|
|
27
|
+
"License :: OSI Approved :: Apache Software License",
|
|
28
|
+
"Programming Language :: Python :: 3",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Programming Language :: Python :: 3.13",
|
|
31
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
32
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
33
|
+
"Typing :: Typed",
|
|
34
|
+
]
|
|
35
|
+
dependencies = [
|
|
36
|
+
"boto3>=1.40.23",
|
|
37
|
+
"langchain>=1.2.10",
|
|
38
|
+
"langchain-aws>=1.3.0",
|
|
39
|
+
"langchain-classic>=1.0.7",
|
|
40
|
+
"langchain-community>=0.4.1",
|
|
41
|
+
"langchain-core>=1.2.16",
|
|
42
|
+
"langchain-experimental>=0.4.1",
|
|
43
|
+
"langchain-text-splitters>=1.1.1",
|
|
44
|
+
"langgraph>=1.0.9",
|
|
45
|
+
"pillow>=11.3.0",
|
|
46
|
+
"pymupdf>=1.27.1",
|
|
47
|
+
"anthropic>=0.84.0",
|
|
48
|
+
"psycopg2-binary>=2.9.11",
|
|
49
|
+
"sqlalchemy[asyncio]>=2.0.43",
|
|
50
|
+
"langchain-postgres>=0.0.17",
|
|
51
|
+
"weaviate-client>=4.0.0",
|
|
52
|
+
"langchain-weaviate>=0.0.3",
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[project.urls]
|
|
56
|
+
Repository = "https://github.com/Restebance/open_rag"
|
|
57
|
+
"Bug Tracker" = "https://github.com/Restebance/open_rag/issues"
|
|
58
|
+
|
|
59
|
+
[dependency-groups]
|
|
60
|
+
dev = [
|
|
61
|
+
"memray>=1.18.0",
|
|
62
|
+
"pyinstrument>=5.1.1",
|
|
63
|
+
"python-dotenv>=1.1.1",
|
|
64
|
+
"pytest>=8.3.5",
|
|
65
|
+
"pytest-asyncio>=0.25.0",
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
[build-system]
|
|
69
|
+
requires = ["uv_build>=0.8.15,<0.9.0"]
|
|
70
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Public API of the package.
#
# Relative imports are used on purpose: the distribution ships this package
# under ``src/wizit_open_rag/``, so absolute ``open_rag.*`` imports would fail
# with ModuleNotFoundError unless a separate ``open_rag`` package happened to
# be installed. Relative imports resolve correctly whatever the package name.
from .application.interfaces import EmbeddingsManager
from .chunks import ChunksManager
from .infra.rag.pg_embeddings import PgEmbeddingsManager
from .infra.rag.weaviate_embeddings import WeaviateEmbeddingsManager
from .transcription import OpenRagTranscriber

__all__ = [
    "OpenRagTranscriber",
    "ChunksManager",
    "EmbeddingsManager",
    "PgEmbeddingsManager",
    "WeaviateEmbeddingsManager",
]
|
|
File without changes
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from langchain_core.documents import Document
|
|
6
|
+
from langchain_core.messages.human import HumanMessage
|
|
7
|
+
from langsmith import Client, tracing_context
|
|
8
|
+
|
|
9
|
+
from open_rag.workflows.context_workflow import ContextWorkflow
|
|
10
|
+
|
|
11
|
+
from .interfaces import (
|
|
12
|
+
AiApplicationService,
|
|
13
|
+
RagChunker,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ContextChunksInDocumentApp:
    """Chunk a Markdown document and enrich every chunk with generated context.

    The document is split by the injected ``rag_chunker``. Each chunk is then
    sent through the compiled context workflow, and the workflow's ``context``
    output is prepended to the chunk text inside ``<context>`` / ``<content>``
    tags.
    """

    def __init__(
        self,
        ai_application_service: AiApplicationService,
        rag_chunker: RagChunker,
        langsmith_api_key: str,
        langsmith_project_name: str,
        target_language: str = "es",
    ):
        """Wire the chat model, the chunker, the context workflow and tracing.

        Args:
            ai_application_service: Provider whose ``load_chat_model()`` result
                is handed to the context workflow.
            rag_chunker: Splitter exposing ``gen_chunks_for_document``.
            langsmith_api_key: API key used to build the LangSmith client.
            langsmith_project_name: Project name passed to the tracing context.
            target_language: Stored on the instance; not read anywhere else in
                this class.
        """
        self.ai_application_service = ai_application_service
        self.rag_chunker = rag_chunker
        self.target_language = target_language
        self.chat_model = self.ai_application_service.load_chat_model()
        # TODO: the additional instructions are always empty for now.
        self.context_additional_instructions = ""
        # Metadata key under which the originating file key is stored.
        self.metadata_source = "source"

        context_workflow = ContextWorkflow(
            self.chat_model, self.context_additional_instructions
        )
        # Build the workflow graph once and keep only the compiled form.
        self.compiled_context_workflow = context_workflow.gen_workflow().compile()

        # Tracing
        self.langsmith_project_name = langsmith_project_name
        self.langsmith_client = Client(api_key=langsmith_api_key)

    async def _retrieve_context_chunk_in_document_with_workflow(
        self,
        workflow,
        markdown_content: str,
        chunk: Document,
        chunk_metadata: dict[str, Any] | None = None,
    ) -> Document:
        """Generate context for one chunk and wrap the chunk text with it.

        The chunk is modified in place: its ``page_content`` is rewritten and
        ``chunk_metadata`` (when given) is merged into its metadata.

        Args:
            workflow: Compiled workflow exposing an async ``ainvoke``.
            markdown_content: Full document text, passed as ``document_content``.
            chunk: The chunk to enrich.
            chunk_metadata: Optional extra metadata copied onto the chunk.

        Returns:
            The same ``chunk`` object, enriched.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        prompt_text = f"Retrieve a complete context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated with the same document's language."
        workflow_input = {
            "messages": [
                HumanMessage(content=[{"type": "text", "text": prompt_text}])
            ],
            "document_content": markdown_content,
        }
        # NOTE(review): these keys are named after the transcription workflow;
        # confirm that the context workflow actually reads them.
        workflow_config = {
            "configurable": {
                "transcription_accuracy_threshold": 0.95,
                "max_transcription_retries": 2,
            }
        }
        try:
            with tracing_context(
                enabled=True,
                project_name=self.langsmith_project_name,
                client=self.langsmith_client,
            ):
                result = await workflow.ainvoke(workflow_input, workflow_config)
            chunk.page_content = f"<context>\n{result['context']}\n</context>\n <content>\n{chunk.page_content}\n</content>"
            if chunk_metadata is not None:
                chunk.metadata.update(chunk_metadata)
            return chunk
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    async def _retrieve_context_chunks_in_document_with_workflow(
        self,
        markdown_content: str,
        chunks: list[Document],
        chunks_metadata: dict[str, Any] | None = None,
    ) -> list[Document]:
        """Enrich all chunks concurrently with the compiled context workflow.

        Args:
            markdown_content: Full document text shared by every invocation.
            chunks: Chunks to enrich.
            chunks_metadata: Optional metadata merged into every chunk.

        Returns:
            The enriched chunks, in the same order as ``chunks``.

        Raises:
            Exception: The first failure raised by any invocation, after logging.
        """
        try:
            pending_invocations = [
                self._retrieve_context_chunk_in_document_with_workflow(
                    self.compiled_context_workflow,
                    markdown_content,
                    chunk,
                    chunks_metadata,
                )
                for chunk in chunks
            ]
            # gather() returns the results in the order the awaitables were given.
            return await asyncio.gather(*pending_invocations)
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    async def get_context_chunks_in_document(
        self, file_key: str, file_markdown_content: str, file_tags: dict | None = None
    ) -> list[Document]:
        """Split a Markdown document into chunks and enrich each with context.

        Args:
            file_key: Identifier of the document; used as the document id and
                stored under the ``source`` metadata key.
            file_markdown_content: Full Markdown text of the document.
            file_tags: Optional tags merged into the document metadata and into
                every resulting chunk's metadata.

        Returns:
            The context-enriched chunks.

        Raises:
            Exception: Any chunking or workflow failure, after logging.
        """
        try:
            document_metadata = {
                self.metadata_source: file_key,
                "source_url": "",
                **(file_tags or {}),
            }
            langchain_rag_document = Document(
                id=file_key,
                page_content=file_markdown_content,
                metadata=document_metadata,
            )
            logger.info(f"Document loaded:{file_key}")
            chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
            logger.info(f"Chunks generated:{len(chunks)}")
            context_chunks = (
                await self._retrieve_context_chunks_in_document_with_workflow(
                    file_markdown_content, chunks, file_tags
                )
            )
            logger.info(f"Context chunks generated:{len(context_chunks)}")
            return context_chunks
        except Exception as e:
            logger.error(f"Error: {str(e)}")
            raise
|