wizit-open-rag 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. wizit_open_rag-0.0.1/PKG-INFO +250 -0
  2. wizit_open_rag-0.0.1/README.md +211 -0
  3. wizit_open_rag-0.0.1/pyproject.toml +70 -0
  4. wizit_open_rag-0.0.1/src/wizit_open_rag/__init__.py +13 -0
  5. wizit_open_rag-0.0.1/src/wizit_open_rag/application/__init__.py +0 -0
  6. wizit_open_rag-0.0.1/src/wizit_open_rag/application/context_chunk_app.py +150 -0
  7. wizit_open_rag-0.0.1/src/wizit_open_rag/application/interfaces.py +178 -0
  8. wizit_open_rag-0.0.1/src/wizit_open_rag/application/kdb_service.py +73 -0
  9. wizit_open_rag-0.0.1/src/wizit_open_rag/application/transcription_app.py +144 -0
  10. wizit_open_rag-0.0.1/src/wizit_open_rag/chunks.py +135 -0
  11. wizit_open_rag-0.0.1/src/wizit_open_rag/data/__init__.py +0 -0
  12. wizit_open_rag-0.0.1/src/wizit_open_rag/data/kdb.py +17 -0
  13. wizit_open_rag-0.0.1/src/wizit_open_rag/data/prompts.py +202 -0
  14. wizit_open_rag-0.0.1/src/wizit_open_rag/data/storage.py +10 -0
  15. wizit_open_rag-0.0.1/src/wizit_open_rag/domain/__init__.py +0 -0
  16. wizit_open_rag-0.0.1/src/wizit_open_rag/domain/models.py +31 -0
  17. wizit_open_rag-0.0.1/src/wizit_open_rag/domain/services.py +97 -0
  18. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/__init__.py +0 -0
  19. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/embeddings/__init__.py +0 -0
  20. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/embeddings/aws_embeddings.py +41 -0
  21. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/llms/__init__.py +0 -0
  22. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/llms/aws_model.py +78 -0
  23. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/persistence/__init__.py +0 -0
  24. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/persistence/pg_connection_manager.py +67 -0
  25. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/persistence/pg_engine_manager.py +34 -0
  26. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/persistence/weaviate_connection_manager.py +79 -0
  27. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/__init__.py +0 -0
  28. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/markdown_chunks.py +73 -0
  29. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/pg_embeddings.py +263 -0
  30. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/recursive_chunks.py +60 -0
  31. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/semantic_chunks.py +72 -0
  32. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/rag/weaviate_embeddings.py +179 -0
  33. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/secrets/__init__.py +0 -0
  34. wizit_open_rag-0.0.1/src/wizit_open_rag/infra/secrets/aws_secrets_manager.py +32 -0
  35. wizit_open_rag-0.0.1/src/wizit_open_rag/transcription.py +131 -0
  36. wizit_open_rag-0.0.1/src/wizit_open_rag/utils/file_utils.py +12 -0
  37. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/__init__.py +0 -0
  38. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/context_nodes.py +81 -0
  39. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/context_state.py +10 -0
  40. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/context_tools.py +58 -0
  41. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/context_workflow.py +40 -0
  42. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_nodes.py +141 -0
  43. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_schemas.py +25 -0
  44. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_state.py +17 -0
  45. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_tools.py +56 -0
  46. wizit_open_rag-0.0.1/src/wizit_open_rag/workflows/transcription_workflow.py +46 -0
@@ -0,0 +1,250 @@
1
+ Metadata-Version: 2.3
2
+ Name: wizit-open-rag
3
+ Version: 0.0.1
4
+ Summary: AI-powered document transcription and semantic chunking for RAG pipelines
5
+ Keywords: rag,retrieval-augmented-generation,llm,chunking,transcription,weaviate,bedrock,langchain,langgraph,pdf,semantic-search
6
+ Author: Restebance
7
+ Author-email: Restebance <restebance@gmail.com>
8
+ License: Apache-2.0
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Classifier: Typing :: Typed
18
+ Requires-Dist: boto3>=1.40.23
19
+ Requires-Dist: langchain>=1.2.10
20
+ Requires-Dist: langchain-aws>=1.3.0
21
+ Requires-Dist: langchain-classic>=1.0.7
22
+ Requires-Dist: langchain-community>=0.4.1
23
+ Requires-Dist: langchain-core>=1.2.16
24
+ Requires-Dist: langchain-experimental>=0.4.1
25
+ Requires-Dist: langchain-text-splitters>=1.1.1
26
+ Requires-Dist: langgraph>=1.0.9
27
+ Requires-Dist: pillow>=11.3.0
28
+ Requires-Dist: pymupdf>=1.27.1
29
+ Requires-Dist: anthropic>=0.84.0
30
+ Requires-Dist: psycopg2-binary>=2.9.11
31
+ Requires-Dist: sqlalchemy[asyncio]>=2.0.43
32
+ Requires-Dist: langchain-postgres>=0.0.17
33
+ Requires-Dist: weaviate-client>=4.0.0
34
+ Requires-Dist: langchain-weaviate>=0.0.3
35
+ Requires-Python: >=3.12
36
+ Project-URL: Bug Tracker, https://github.com/Restebance/open_rag/issues
37
+ Project-URL: Repository, https://github.com/Restebance/open_rag
38
+ Description-Content-Type: text/markdown
39
+
40
+ # open_rag
41
+
42
+ A Python library for AI-powered document transcription and semantic chunking with RAG (Retrieval-Augmented Generation). It processes PDFs through LLMs (Claude via **AWS Bedrock**), chunks the resulting Markdown semantically, enriches each chunk with surrounding context, and returns ready-to-index `Document` objects for PostgreSQL pgvector.
43
+
44
+ **Version**: 0.0.1 | **Python**: >=3.12 | **Build**: uv
45
+
46
+ ---
47
+
48
+ ## Features
49
+
50
+ - PDF-to-Markdown transcription powered by Claude via AWS Bedrock
51
+ - LangGraph-based transcription workflow with configurable retry logic and accuracy thresholds
52
+ - Semantic chunking with 85th-percentile breakpoints (plus recursive and Markdown-header strategies)
53
+ - Per-chunk context enrichment via a dedicated LangGraph workflow — each chunk is wrapped with `<context>` and `<content>` tags
54
+ - Pluggable storage backends: local filesystem or AWS S3
55
+ - Vector indexing into PostgreSQL pgvector via LangChain `PGVectorStore`
56
+ - LangSmith tracing support
57
+
58
+ ---
59
+
60
+ ## Prerequisites
61
+
62
+ - Python 3.12 or higher
63
+ - [uv](https://docs.astral.sh/uv/) for dependency management
64
+ - AWS credentials configured (standard boto3 credential chain — env vars, `~/.aws/credentials`, or instance profile)
65
+ - PostgreSQL database with the [pgvector](https://github.com/pgvector/pgvector) extension enabled
66
+
67
+ ---
68
+
69
+ ## Installation
70
+
71
+ Install from PyPI:
72
+
73
+ ```bash
74
+ pip install wizit-open-rag
75
+ ```
76
+
77
+ For development (clone + install with dev tools):
78
+
79
+ ```bash
80
+ git clone https://github.com/Restebance/open_rag.git
81
+ cd open_rag
82
+ uv sync --group dev
83
+ cp example.env .env
84
+ ```
85
+
86
+ Fill in `.env` with your credentials (see [Environment Variables](#environment-variables) below).
87
+
88
+ ---
89
+
90
+ ## Usage
91
+
92
+ ### Document Transcription
93
+
94
+ `OpenRagTranscriber` accepts the raw bytes of a single PDF page and returns a `ParsedDocPage` containing the Markdown transcription.
95
+
96
+ ```python
97
+ import asyncio
98
+ from open_rag import OpenRagTranscriber
99
+
100
+ transcriber = OpenRagTranscriber(
101
+ langsmith_project_name="my-project", # required
102
+ langsmith_api_key="lsv2_...", # required
103
+ llm_model_id="global.anthropic.claude-sonnet-4-6",
104
+ target_language="es-CO",
105
+ transcription_accuracy_threshold=0.90,
106
+ max_transcription_retries=2,
107
+ )
108
+
109
+ with open("page.pdf", "rb") as f:
110
+ page_bytes = f.read()
111
+
112
+ result = asyncio.run(transcriber.transcribe_document(page_bytes))
113
+ print(result.page_text) # Markdown string
114
+ ```
115
+
116
+ ### Semantic Chunking with Context
117
+
118
+ `ChunksManager` takes a pre-loaded Markdown string and returns a list of LangChain `Document` objects, each enriched with a contextual summary.
119
+
120
+ ```python
121
+ import asyncio
122
+ from open_rag import ChunksManager
123
+
124
+ manager = ChunksManager(
125
+ langsmith_project_name="my-project", # required
126
+ langsmith_api_key="lsv2_...", # required
127
+ )
128
+
129
+ with open("document.md") as f:
130
+ markdown_content = f.read()
131
+
132
+ docs = asyncio.run(manager.gen_context_chunks(
133
+ file_key="document.md",
134
+ file_markdown_content=markdown_content,
135
+ file_tags={"category": "hr", "department": "onboarding"},
136
+ ))
137
+
138
+ # docs is a List[Document]; index to pgvector as needed
139
+ for doc in docs:
140
+ print(doc.page_content)
141
+ ```
142
+
143
+ > **Note:** `gen_context_chunks` does not load files from storage — the caller must pass the content as a string. Indexing to pgvector is the caller's responsibility.
144
+
145
+ ---
146
+
147
+ ## Architecture
148
+
149
+ The codebase follows a clean layered architecture. Dependency direction: `transcription.py / chunks.py → application → domain ↔ infra ← workflows`.
150
+
151
+ ```
152
+ open_rag/ # installable package (src/open_rag/)
153
+ ├── transcription.py # Public API — OpenRagTranscriber
154
+ ├── chunks.py # Public API — ChunksManager
155
+ ├── domain/ # Core data models (PageToTranscribe, ParsedDocPage, ParsedDoc)
156
+ ├── application/ # Orchestration + abstract interfaces (ABCs)
157
+ ├── data/ # Shared enums and prompt strings
158
+ ├── infra/
159
+ │ ├── llms/ # AWS Bedrock chat (ChatBedrockConverse)
160
+ │ ├── embeddings/ # AWS Bedrock embeddings (BedrockEmbeddings)
161
+ │ ├── persistence/ # Local filesystem, AWS S3, PostgreSQL managers
162
+ │ ├── rag/ # SemanticChunks, RecursiveChunks, MarkdownHeadersChunks, PGVectorStore, WeaviateEmbeddingsManager
163
+ │ └── secrets/ # AWS Secrets Manager helper
164
+ ├── utils/ # validate_file_name_format
165
+ └── workflows/ # LangGraph state machines (transcription + context)
166
+ tests/ # pytest suite
167
+ data/ # Sample / test documents
168
+ example.env
169
+ pyproject.toml
170
+ ```
171
+
172
+ ### Key Data Flow
173
+
174
+ ```
175
+ PDF bytes
176
+ → ParseDocModelService (PyMuPDF → base64 pages)
177
+ → TranscriptionWorkflow (LangGraph → Claude via AWS Bedrock → Markdown)
178
+
179
+ Markdown string + tags
180
+ → SemanticChunks (AWS Bedrock embeddings, 85th-percentile breakpoints)
181
+ → ContextWorkflow (LangGraph → Claude adds surrounding context per chunk)
182
+ → List[Document] (each chunk wrapped in <context> / <content> tags)
183
+ → Caller indexes to pgvector
184
+ ```
185
+
186
+ ---
187
+
188
+ ## Environment Variables
189
+
190
+ Copy `example.env` to `.env` and fill in the values:
191
+
192
+ | Variable | Purpose |
193
+ |---|---|
194
+ | `VECTOR_STORE_CONNECTION` | PostgreSQL connection string (pgvector) |
195
+ | `VECTOR_STORE_TABLE` | pgvector table name |
196
+ | `LANGSMITH_API_KEY` | LangSmith API key for tracing |
197
+ | `LANGCHAIN_PROJECT` | LangSmith project name |
198
+ | `LANGSMITH_TRACING` | Enable LangSmith tracing (`true` / `false`) |
199
+ | `SUPABASE_KEY` / `SUPABASE_URL` | Supabase credentials (optional) |
200
+
201
+ AWS credentials are read from the standard **boto3 credential chain** and are not set in `.env`.
202
+
203
+ ---
204
+
205
+ ## Development
206
+
207
+ ### Running tests
208
+
209
+ ```bash
210
+ # Unit tests (mocked — no AWS credentials required)
211
+ uv run pytest
212
+
213
+ # Transcription integration test (requires live AWS credentials)
214
+ uv run python src/open_rag/transcription.py
215
+
216
+ # Chunking integration test (requires live AWS credentials)
217
+ uv run python src/open_rag/chunks.py
218
+ ```
219
+
220
+ ### Profiling
221
+
222
+ ```bash
223
+ # CPU profiling
224
+ uv run pyinstrument test.py transcribe <file.pdf> <source_dir> <target_dir>
225
+
226
+ # Memory profiling
227
+ uv run python -m memray run test.py transcribe <file.pdf> <source_dir> <target_dir>
228
+ ```
229
+
230
+ ### Building the package
231
+
232
+ ```bash
233
+ uv build
234
+ ```
235
+
236
+ ---
237
+
238
+ ## Gotchas
239
+
240
+ - `SemanticChunks` calls AWS Bedrock **at construction time** (via `SemanticChunker`) — not just at index time. Make sure credentials are available before instantiating `ChunksManager`.
241
+ - Both `transcribe_document` and `gen_context_chunks` are `async`; wrap them in `asyncio.run(...)` from synchronous code.
242
+ - `OpenRagTranscriber` and `ChunksManager` require `langsmith_project_name` and `langsmith_api_key` as **constructor arguments** — they are not read from environment variables.
243
+ - `ParseDocModelService.parse_document_to_base64_pages` iterates `range(0, page_count)` — pages are zero-indexed (`page_number=0` is the first page).
244
+ - AWS Bedrock cross-region model IDs use the `global.` prefix (e.g. `global.anthropic.claude-sonnet-4-6`).
245
+
246
+ ---
247
+
248
+ ## License
249
+
250
+ Licensed under the [Apache License 2.0](LICENSE.md).
@@ -0,0 +1,211 @@
1
+ # open_rag
2
+
3
+ A Python library for AI-powered document transcription and semantic chunking with RAG (Retrieval-Augmented Generation). It processes PDFs through LLMs (Claude via **AWS Bedrock**), chunks the resulting Markdown semantically, enriches each chunk with surrounding context, and returns ready-to-index `Document` objects for PostgreSQL pgvector.
4
+
5
+ **Version**: 0.0.1 | **Python**: >=3.12 | **Build**: uv
6
+
7
+ ---
8
+
9
+ ## Features
10
+
11
+ - PDF-to-Markdown transcription powered by Claude via AWS Bedrock
12
+ - LangGraph-based transcription workflow with configurable retry logic and accuracy thresholds
13
+ - Semantic chunking with 85th-percentile breakpoints (plus recursive and Markdown-header strategies)
14
+ - Per-chunk context enrichment via a dedicated LangGraph workflow — each chunk is wrapped with `<context>` and `<content>` tags
15
+ - Pluggable storage backends: local filesystem or AWS S3
16
+ - Vector indexing into PostgreSQL pgvector via LangChain `PGVectorStore`
17
+ - LangSmith tracing support
18
+
19
+ ---
20
+
21
+ ## Prerequisites
22
+
23
+ - Python 3.12 or higher
24
+ - [uv](https://docs.astral.sh/uv/) for dependency management
25
+ - AWS credentials configured (standard boto3 credential chain — env vars, `~/.aws/credentials`, or instance profile)
26
+ - PostgreSQL database with the [pgvector](https://github.com/pgvector/pgvector) extension enabled
27
+
28
+ ---
29
+
30
+ ## Installation
31
+
32
+ Install from PyPI:
33
+
34
+ ```bash
35
+ pip install wizit-open-rag
36
+ ```
37
+
38
+ For development (clone + install with dev tools):
39
+
40
+ ```bash
41
+ git clone https://github.com/Restebance/open_rag.git
42
+ cd open_rag
43
+ uv sync --group dev
44
+ cp example.env .env
45
+ ```
46
+
47
+ Fill in `.env` with your credentials (see [Environment Variables](#environment-variables) below).
48
+
49
+ ---
50
+
51
+ ## Usage
52
+
53
+ ### Document Transcription
54
+
55
+ `OpenRagTranscriber` accepts the raw bytes of a single PDF page and returns a `ParsedDocPage` containing the Markdown transcription.
56
+
57
+ ```python
58
+ import asyncio
59
+ from open_rag import OpenRagTranscriber
60
+
61
+ transcriber = OpenRagTranscriber(
62
+ langsmith_project_name="my-project", # required
63
+ langsmith_api_key="lsv2_...", # required
64
+ llm_model_id="global.anthropic.claude-sonnet-4-6",
65
+ target_language="es-CO",
66
+ transcription_accuracy_threshold=0.90,
67
+ max_transcription_retries=2,
68
+ )
69
+
70
+ with open("page.pdf", "rb") as f:
71
+ page_bytes = f.read()
72
+
73
+ result = asyncio.run(transcriber.transcribe_document(page_bytes))
74
+ print(result.page_text) # Markdown string
75
+ ```
76
+
77
+ ### Semantic Chunking with Context
78
+
79
+ `ChunksManager` takes a pre-loaded Markdown string and returns a list of LangChain `Document` objects, each enriched with a contextual summary.
80
+
81
+ ```python
82
+ import asyncio
83
+ from open_rag import ChunksManager
84
+
85
+ manager = ChunksManager(
86
+ langsmith_project_name="my-project", # required
87
+ langsmith_api_key="lsv2_...", # required
88
+ )
89
+
90
+ with open("document.md") as f:
91
+ markdown_content = f.read()
92
+
93
+ docs = asyncio.run(manager.gen_context_chunks(
94
+ file_key="document.md",
95
+ file_markdown_content=markdown_content,
96
+ file_tags={"category": "hr", "department": "onboarding"},
97
+ ))
98
+
99
+ # docs is a List[Document]; index to pgvector as needed
100
+ for doc in docs:
101
+ print(doc.page_content)
102
+ ```
103
+
104
+ > **Note:** `gen_context_chunks` does not load files from storage — the caller must pass the content as a string. Indexing to pgvector is the caller's responsibility.
105
+
106
+ ---
107
+
108
+ ## Architecture
109
+
110
+ The codebase follows a clean layered architecture. Dependency direction: `transcription.py / chunks.py → application → domain ↔ infra ← workflows`.
111
+
112
+ ```
113
+ open_rag/ # installable package (src/open_rag/)
114
+ ├── transcription.py # Public API — OpenRagTranscriber
115
+ ├── chunks.py # Public API — ChunksManager
116
+ ├── domain/ # Core data models (PageToTranscribe, ParsedDocPage, ParsedDoc)
117
+ ├── application/ # Orchestration + abstract interfaces (ABCs)
118
+ ├── data/ # Shared enums and prompt strings
119
+ ├── infra/
120
+ │ ├── llms/ # AWS Bedrock chat (ChatBedrockConverse)
121
+ │ ├── embeddings/ # AWS Bedrock embeddings (BedrockEmbeddings)
122
+ │ ├── persistence/ # Local filesystem, AWS S3, PostgreSQL managers
123
+ │ ├── rag/ # SemanticChunks, RecursiveChunks, MarkdownHeadersChunks, PGVectorStore, WeaviateEmbeddingsManager
124
+ │ └── secrets/ # AWS Secrets Manager helper
125
+ ├── utils/ # validate_file_name_format
126
+ └── workflows/ # LangGraph state machines (transcription + context)
127
+ tests/ # pytest suite
128
+ data/ # Sample / test documents
129
+ example.env
130
+ pyproject.toml
131
+ ```
132
+
133
+ ### Key Data Flow
134
+
135
+ ```
136
+ PDF bytes
137
+ → ParseDocModelService (PyMuPDF → base64 pages)
138
+ → TranscriptionWorkflow (LangGraph → Claude via AWS Bedrock → Markdown)
139
+
140
+ Markdown string + tags
141
+ → SemanticChunks (AWS Bedrock embeddings, 85th-percentile breakpoints)
142
+ → ContextWorkflow (LangGraph → Claude adds surrounding context per chunk)
143
+ → List[Document] (each chunk wrapped in <context> / <content> tags)
144
+ → Caller indexes to pgvector
145
+ ```
146
+
147
+ ---
148
+
149
+ ## Environment Variables
150
+
151
+ Copy `example.env` to `.env` and fill in the values:
152
+
153
+ | Variable | Purpose |
154
+ |---|---|
155
+ | `VECTOR_STORE_CONNECTION` | PostgreSQL connection string (pgvector) |
156
+ | `VECTOR_STORE_TABLE` | pgvector table name |
157
+ | `LANGSMITH_API_KEY` | LangSmith API key for tracing |
158
+ | `LANGCHAIN_PROJECT` | LangSmith project name |
159
+ | `LANGSMITH_TRACING` | Enable LangSmith tracing (`true` / `false`) |
160
+ | `SUPABASE_KEY` / `SUPABASE_URL` | Supabase credentials (optional) |
161
+
162
+ AWS credentials are read from the standard **boto3 credential chain** and are not set in `.env`.
163
+
164
+ ---
165
+
166
+ ## Development
167
+
168
+ ### Running tests
169
+
170
+ ```bash
171
+ # Unit tests (mocked — no AWS credentials required)
172
+ uv run pytest
173
+
174
+ # Transcription integration test (requires live AWS credentials)
175
+ uv run python src/open_rag/transcription.py
176
+
177
+ # Chunking integration test (requires live AWS credentials)
178
+ uv run python src/open_rag/chunks.py
179
+ ```
180
+
181
+ ### Profiling
182
+
183
+ ```bash
184
+ # CPU profiling
185
+ uv run pyinstrument test.py transcribe <file.pdf> <source_dir> <target_dir>
186
+
187
+ # Memory profiling
188
+ uv run python -m memray run test.py transcribe <file.pdf> <source_dir> <target_dir>
189
+ ```
190
+
191
+ ### Building the package
192
+
193
+ ```bash
194
+ uv build
195
+ ```
196
+
197
+ ---
198
+
199
+ ## Gotchas
200
+
201
+ - `SemanticChunks` calls AWS Bedrock **at construction time** (via `SemanticChunker`) — not just at index time. Make sure credentials are available before instantiating `ChunksManager`.
202
+ - Both `transcribe_document` and `gen_context_chunks` are `async`; wrap them in `asyncio.run(...)` from synchronous code.
203
+ - `OpenRagTranscriber` and `ChunksManager` require `langsmith_project_name` and `langsmith_api_key` as **constructor arguments** — they are not read from environment variables.
204
+ - `ParseDocModelService.parse_document_to_base64_pages` iterates `range(0, page_count)` — pages are zero-indexed (`page_number=0` is the first page).
205
+ - AWS Bedrock cross-region model IDs use the `global.` prefix (e.g. `global.anthropic.claude-sonnet-4-6`).
206
+
207
+ ---
208
+
209
+ ## License
210
+
211
+ Licensed under the [Apache License 2.0](LICENSE.md).
@@ -0,0 +1,70 @@
1
+ [project]
2
+ name = "wizit_open_rag"
3
+ version = "0.0.1"
4
+ description = "AI-powered document transcription and semantic chunking for RAG pipelines"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ license = { text = "Apache-2.0" }
8
+ authors = [
9
+ { name = "Restebance", email = "restebance@gmail.com" },
10
+ ]
11
+ keywords = [
12
+ "rag",
13
+ "retrieval-augmented-generation",
14
+ "llm",
15
+ "chunking",
16
+ "transcription",
17
+ "weaviate",
18
+ "bedrock",
19
+ "langchain",
20
+ "langgraph",
21
+ "pdf",
22
+ "semantic-search",
23
+ ]
24
+ classifiers = [
25
+ "Development Status :: 3 - Alpha",
26
+ "Intended Audience :: Developers",
27
+ "License :: OSI Approved :: Apache Software License",
28
+ "Programming Language :: Python :: 3",
29
+ "Programming Language :: Python :: 3.12",
30
+ "Programming Language :: Python :: 3.13",
31
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
32
+ "Topic :: Software Development :: Libraries :: Python Modules",
33
+ "Typing :: Typed",
34
+ ]
35
+ dependencies = [
36
+ "boto3>=1.40.23",
37
+ "langchain>=1.2.10",
38
+ "langchain-aws>=1.3.0",
39
+ "langchain-classic>=1.0.7",
40
+ "langchain-community>=0.4.1",
41
+ "langchain-core>=1.2.16",
42
+ "langchain-experimental>=0.4.1",
43
+ "langchain-text-splitters>=1.1.1",
44
+ "langgraph>=1.0.9",
45
+ "pillow>=11.3.0",
46
+ "pymupdf>=1.27.1",
47
+ "anthropic>=0.84.0",
48
+ "psycopg2-binary>=2.9.11",
49
+ "sqlalchemy[asyncio]>=2.0.43",
50
+ "langchain-postgres>=0.0.17",
51
+ "weaviate-client>=4.0.0",
52
+ "langchain-weaviate>=0.0.3",
53
+ ]
54
+
55
+ [project.urls]
56
+ Repository = "https://github.com/Restebance/open_rag"
57
+ "Bug Tracker" = "https://github.com/Restebance/open_rag/issues"
58
+
59
+ [dependency-groups]
60
+ dev = [
61
+ "memray>=1.18.0",
62
+ "pyinstrument>=5.1.1",
63
+ "python-dotenv>=1.1.1",
64
+ "pytest>=8.3.5",
65
+ "pytest-asyncio>=0.25.0",
66
+ ]
67
+
68
+ [build-system]
69
+ requires = ["uv_build>=0.8.15,<0.9.0"]
70
+ build-backend = "uv_build"
@@ -0,0 +1,13 @@
1
+ from open_rag.application.interfaces import EmbeddingsManager
2
+ from open_rag.chunks import ChunksManager
3
+ from open_rag.infra.rag.pg_embeddings import PgEmbeddingsManager
4
+ from open_rag.infra.rag.weaviate_embeddings import WeaviateEmbeddingsManager
5
+ from open_rag.transcription import OpenRagTranscriber
6
+
7
+ __all__ = [
8
+ "OpenRagTranscriber",
9
+ "ChunksManager",
10
+ "EmbeddingsManager",
11
+ "PgEmbeddingsManager",
12
+ "WeaviateEmbeddingsManager",
13
+ ]
@@ -0,0 +1,150 @@
1
+ import asyncio
2
+ import logging
3
+ from typing import Any
4
+
5
+ from langchain_core.documents import Document
6
+ from langchain_core.messages.human import HumanMessage
7
+ from langsmith import Client, tracing_context
8
+
9
+ from open_rag.workflows.context_workflow import ContextWorkflow
10
+
11
+ from .interfaces import (
12
+ AiApplicationService,
13
+ RagChunker,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class ContextChunksInDocumentApp:
    """
    Application service that chunks a Markdown document and enriches each
    chunk with context produced by the compiled context workflow.

    The chat model is loaded and the context workflow is compiled once, at
    construction time; every chunk is then processed by that same compiled
    workflow under a LangSmith tracing context.
    """

    def __init__(
        self,
        ai_application_service: AiApplicationService,
        rag_chunker: RagChunker,
        langsmith_api_key: str,
        langsmith_project_name: str,
        target_language: str = "es",
    ):
        """
        Initialize the ContextChunksInDocumentApp.

        Args:
            ai_application_service: Provider used to load the chat model.
            rag_chunker: Chunker used to split the document into chunks.
            langsmith_api_key: API key used to build the LangSmith client.
            langsmith_project_name: Project name passed to the tracing context.
            target_language: Stored on the instance; not read elsewhere in
                this class.
        """
        self.ai_application_service = ai_application_service
        self.rag_chunker = rag_chunker
        self.target_language = target_language
        self.chat_model = self.ai_application_service.load_chat_model()
        # TODO
        self.context_additional_instructions = ""
        # Metadata key under which the originating file key is stored.
        self.metadata_source = "source"

        context_workflow = ContextWorkflow(
            self.chat_model, self.context_additional_instructions
        )
        self.compiled_context_workflow = context_workflow.gen_workflow().compile()
        # TRACING
        self.langsmith_project_name = langsmith_project_name
        self.langsmith_client = Client(api_key=langsmith_api_key)

    async def _retrieve_context_chunk_in_document_with_workflow(
        self,
        workflow,
        markdown_content: str,
        chunk: Document,
        chunk_metadata: dict[str, Any] | None = None,
    ) -> Document:
        """
        Generate context for a single chunk and wrap the chunk with it.

        The chunk is modified in place: its ``page_content`` is rewritten as
        ``<context>...</context>`` followed by ``<content>...</content>``,
        and ``chunk_metadata`` (when given) is merged into its metadata.

        Args:
            workflow: Compiled workflow exposing ``ainvoke``.
            markdown_content: Full document text, passed to the workflow as
                ``document_content``.
            chunk: Chunk to enrich.
            chunk_metadata: Extra metadata entries to set on the chunk.

        Returns:
            The same ``chunk`` object, enriched.

        Raises:
            Exception: Any error raised by the workflow is logged and
                re-raised unchanged.
        """
        try:
            with tracing_context(
                enabled=True,
                project_name=self.langsmith_project_name,
                client=self.langsmith_client,
            ):
                result = await workflow.ainvoke(
                    {
                        "messages": [
                            HumanMessage(
                                content=[
                                    {
                                        "type": "text",
                                        "text": f"Retrieve a complete context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated with the same document's language.",
                                    },
                                ]
                            )
                        ],
                        "document_content": markdown_content,
                    },
                    {
                        "configurable": {
                            "transcription_accuracy_threshold": 0.95,
                            "max_transcription_retries": 2,
                        }
                    },
                )
            chunk.page_content = f"<context>\n{result['context']}\n</context>\n <content>\n{chunk.page_content}\n</content>"
            if chunk_metadata is not None:
                chunk.metadata.update(chunk_metadata)
            return chunk
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    async def _retrieve_context_chunks_in_document_with_workflow(
        self,
        markdown_content: str,
        chunks: list[Document],
        chunks_metadata: dict[str, Any] | None = None,
    ) -> list[Document]:
        """
        Enrich all chunks concurrently with the compiled context workflow.

        Args:
            markdown_content: Full document text shared by every invocation.
            chunks: Chunks to enrich.
            chunks_metadata: Metadata entries applied to every chunk.

        Returns:
            The enriched chunks, in the same order as ``chunks``.

        Raises:
            Exception: The first error raised by any invocation is logged
                and re-raised.
        """
        try:
            pending_invocations = [
                self._retrieve_context_chunk_in_document_with_workflow(
                    self.compiled_context_workflow,
                    markdown_content,
                    chunk,
                    chunks_metadata,
                )
                for chunk in chunks
            ]
            # gather preserves input order in its result list.
            context_chunks = await asyncio.gather(*pending_invocations)
            return context_chunks
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    async def get_context_chunks_in_document(
        self, file_key: str, file_markdown_content: str, file_tags: dict | None = None
    ) -> list[Document]:
        """
        Chunk a Markdown document and return its context-enriched chunks.

        Args:
            file_key: Identifier of the document; used as the document id
                and stored in metadata under the ``source`` key.
            file_markdown_content: Markdown text of the document.
            file_tags: Optional tags merged into the document metadata and
                into every resulting chunk's metadata.

        Returns:
            The context-enriched chunks.

        Raises:
            Exception: Any error from chunking or context generation is
                logged and re-raised.
        """
        try:
            langchain_rag_document = Document(
                id=file_key,
                page_content=file_markdown_content,
                metadata={
                    self.metadata_source: file_key,
                    "source_url": "",
                    **(file_tags or {}),
                },
            )
            logger.info(f"Document loaded:{file_key}")
            chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
            # A leftover debugging breakpoint() was removed here: it halted
            # every call in the debugger (or failed without a usable stdin).
            logger.info(f"Chunks generated:{len(chunks)}")
            context_chunks = (
                await self._retrieve_context_chunks_in_document_with_workflow(
                    file_markdown_content, chunks, file_tags
                )
            )
            logger.info(f"Context chunks generated:{len(context_chunks)}")
            return context_chunks
        except Exception as e:
            logger.error(f"Error: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise