wizit-context-ingestor 0.2.5b2__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wizit_context_ingestor/__init__.py +2 -2
- wizit_context_ingestor/application/context_chunk_service.py +149 -35
- wizit_context_ingestor/application/interfaces.py +1 -1
- wizit_context_ingestor/application/transcription_service.py +132 -49
- wizit_context_ingestor/data/kdb.py +10 -0
- wizit_context_ingestor/data/prompts.py +156 -2
- wizit_context_ingestor/data/storage.py +10 -0
- wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
- wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
- wizit_context_ingestor/infra/rag/chroma_embeddings.py +135 -0
- wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
- wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
- wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
- wizit_context_ingestor/infra/vertex_model.py +56 -28
- wizit_context_ingestor/main.py +160 -105
- wizit_context_ingestor/utils/file_utils.py +13 -0
- wizit_context_ingestor/workflows/context_nodes.py +73 -0
- wizit_context_ingestor/workflows/context_state.py +10 -0
- wizit_context_ingestor/workflows/context_tools.py +58 -0
- wizit_context_ingestor/workflows/context_workflow.py +42 -0
- wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
- wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
- wizit_context_ingestor/workflows/transcription_state.py +17 -0
- wizit_context_ingestor/workflows/transcription_tools.py +54 -0
- wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/METADATA +10 -1
- wizit_context_ingestor-0.3.0b1.dist-info/RECORD +44 -0
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/WHEEL +1 -1
- wizit_context_ingestor-0.2.5b2.dist-info/RECORD +0 -31
wizit_context_ingestor/data/prompts.py

@@ -1,5 +1,93 @@
 from pydantic import BaseModel, Field
 
+AGENT_TRANSCRIPTION_SYSTEM_PROMPT = """
+You are an expert document transcription assistant.
+Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.
+OBJECTIVE: Create a complete, accurate transcription that preserves the original document's content, structure and formatting.
+TRANSCRIPTION RULES:
+<hard_rules>
+1. Document's languages must be detected to ensure correct transcription
+2. Systematically examine each content element (text, images, tables, formatting)
+3. Convert all content to markdown while preserving structure and meaning
+5. Ensure completeness and accuracy of the transcription
+6. TEXT TRANSCRIPTION:
+- Transcribe all visible text exactly as it appears
+- Include: paragraphs, headings, subheadings, headers, footers
+- Include: footnotes, page numbers, bullet points, lists, captions
+- Preserve: bold, italic, underlined, and other text formatting using markdown
+7. LANGUAGE REQUIREMENTS:
+- Transcribed content MUST preserve document's language
+- Translate any secondary language content to maintain consistency
+8. COMPLETENESS:
+- Transcribe the entire document, partial transcriptions are not allowed
+- Never summarize, modify, or generate additional content
+- Maintain original meaning and context
+9. FORMATTING STANDARDS:
+- Use proper markdown syntax for structure
+- Avoid blank lines in transcription
+- Exclude logos, watermarks, and decorative icons
+- Omit special characters that interfere with markdown
+10. IMAGE HANDLING:
+<image_transcription_rules>
+- Extract and transcribe any text within images
+- For data-rich images: create markdown tables when applicable
+- For other images: provide descriptive content summaries
+- Classify each visual element as: Chart, Diagram, Natural Image, Screenshot, or Other
+- Format: <figure_type>Classification</figure_type>
+- Wrap content in <figure></figure> tags with title/caption if available
+</image_transcription_rules>
+11. TABLE PROCESSING:
+<tables_transcription_rules>
+- Convert all tables to proper markdown table format
+- Preserve cell alignment and structure as closely as possible
+- Maintain data relationships and hierarchy
+- Include table headers and formatting
+</tables_transcription_rules>
+12. QUALITY ASSURANCE:
+- Ensure no content is omitted or added
+- Check markdown formatting is correct
+- Confirm structural integrity is maintained
+</hard_rules>
+
+CRITICAL REMINDERS:
+<critical_reminders>
+- Accuracy over speed, every character matters
+- Preserve original document intent and meaning
+- Maintain professional transcription standards
+- Complete transcription is mandatory
+</critical_reminders>
+When provided, use the following transcription notes from previous transcription attempts to improve the current transcription:
+<transcription_notes>
+{transcription_notes}
+</transcription_notes>
+When provided, use the following additional transcription instructions to improve results:
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+"""
+# Generate the optimized transcription following these specifications:
+# {format_instructions}
+
+
+IMAGE_TRANSCRIPTION_CHECK_SYSTEM_PROMPT = """
+You are an expert document transcription grader.
+Your task is to evaluate the quality of the following transcription.
+<rules>
+- Provide an accurate evaluation of the transcription ensuring quality, completeness and accuracy.
+- The transcription uses markdown formatting; the markdown must reflect the original document's structure and formatting.
+- Compare the transcription with the original document (provided as an image)
+</rules>
+<transcription>
+{transcription}
+</transcription>
+
+When provided, evaluate whether the following additional transcription instructions provided by the user have been followed:
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+"""
+
+
 IMAGE_TRANSCRIPTION_SYSTEM_PROMPT = """
 You are an expert document transcription assistant. Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.
 
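The new agent prompt leaves `{transcription_notes}` and `{transcription_additional_instructions}` as template variables. A minimal sketch of how they might be rendered with LangChain follows; the multimodal human turn and the variable wiring are assumptions for illustration, not code from this package:

```python
# Minimal sketch, not package code: render the new agent prompt with
# LangChain. The human turn carrying the page image is an assumption.
from langchain_core.prompts import ChatPromptTemplate

from wizit_context_ingestor.data.prompts import AGENT_TRANSCRIPTION_SYSTEM_PROMPT

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", AGENT_TRANSCRIPTION_SYSTEM_PROMPT),
        # Templated multimodal turn; {page_b64} is filled at render time.
        (
            "human",
            [{"type": "image_url", "image_url": {"url": "data:image/png;base64,{page_b64}"}}],
        ),
    ]
)

messages = prompt.format_messages(
    transcription_notes="Page header was misread on the previous attempt.",
    transcription_additional_instructions="Keep all tables in markdown.",
    page_b64="...",  # base64-encoded page image, elided
)
```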
@@ -22,6 +110,7 @@ TRANSCRIPTION RULES:
 - Include: footnotes, page numbers, bullet points, lists, captions
 - Preserve: bold, italic, underlined, and other text formatting using markdown
 - Mark unclear text as [unclear] or [illegible] with best guess in brackets
+- Enclose all underlined content in <UnderlinedContent></UnderlinedContent> tags
 
 2. LANGUAGE REQUIREMENTS:
 - All transcribed content MUST be in the document's primary language
@@ -70,9 +159,13 @@ CRITICAL REMINDERS:
 - Maintain professional transcription standards
 - Complete transcription is mandatory
 
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+
+
 Generate the optimized transcription following these specifications:
 {format_instructions}
-
 """
 
 CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
@@ -132,10 +225,71 @@ Generate the optimized context following these specifications:
 {format_instructions}
 """
 
+WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
+You are an expert RAG (Retrieval-Augmented Generation) context generator that creates optimized contextual chunks from markdown document content for enhanced search and retrieval performance.
+OBJECTIVE: Generate rich, searchable context descriptions that maximize retrieval accuracy and relevance in RAG systems.
+WORKFLOW:
+<task_analysis>
+1. LANGUAGE DETECTION: Identify the primary language used in the document content
+2. SEMANTIC ANALYSIS: Understand the chunk's meaning, relationships, and significance within the broader document
+3. CONTEXT GENERATION: Create comprehensive context metadata that enhances retrieval effectiveness
+4. SEARCH OPTIMIZATION: Ensure context includes terms and concepts that users might search for
+5. QUALITY VALIDATION: Verify context completeness and retrieval utility
+</task_analysis>
+CONTEXT GENERATION REQUIREMENTS:
+<context_elements>
+Your generated context must synthesize ALL of these elements into a coherent description:
+- chunk_relation_with_document: How this chunk connects to and fits within the overall document structure and narrative
+- chunk_keywords: Primary and secondary keywords, technical terms, and searchable phrases that would help users find this content
+- chunk_description: Clear explanation of what the chunk contains, including data types, concepts, and information presented
+- chunk_function: The chunk's specific purpose and role (e.g., definition, explanation, example, instruction, procedure, list, summary, analysis, conclusion)
+- chunk_structure: Format and organizational pattern (paragraph, bulleted list, numbered steps, table, code block, heading, etc.)
+- chunk_main_idea: The central concept, message, or takeaway that the chunk communicates
+- chunk_domain: Subject area or field of knowledge (e.g., technical documentation, legal text, medical information, business process)
+- chunk_audience: Intended reader level and background (e.g., beginner, expert, general audience, specific role)
+</context_elements>
+CRITICAL RULES:
+<critical_rules>
+- Context MUST be written in the SAME language as the source document content
+- Be comprehensive yet concise - aim for maximum information density
+- Prioritize search retrieval optimization and semantic understanding
+- Include synonyms and alternative phrasings users might search for
+- Focus on conceptual relationships and knowledge connections
+- Do NOT reproduce or quote the original chunk content verbatim
+- Ensure context is self-contained and understandable without the original chunk
+- Use natural language that flows well while incorporating all required elements
+</critical_rules>
+
+SEARCH OPTIMIZATION GUIDELINES:
+<search_optimization>
+- Include both explicit terms from the content and implicit concepts
+- Consider various ways users might phrase queries related to this content
+- Incorporate hierarchical information (section → subsection → detail level)
+- Add contextual bridges that connect this chunk to related topics
+- Use varied vocabulary to capture different search approaches
+</search_optimization>
+
+<document_content>
+{document_content}
+</document_content>
+
+
+When provided, follow these additional context extraction instructions:
+<additional_instructions>
+{context_additional_instructions}
+</additional_instructions>
+
+"""
+
+
 class ContextChunk(BaseModel):
-    context: str = Field(
+    context: str = Field(
+        description="Context description that helps with search retrieval"
+    )
+
 
 class Transcription(BaseModel):
     """Document Transcription."""
+
     transcription: str = Field(description="Full transcription")
     language: str = Field(description="Main language")
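Since ContextChunk is a plain Pydantic model, the new workflow prompt pairs naturally with structured output. A sketch under the assumption that the service binds it via LangChain's with_structured_output; the human turn, the Vertex model choice (the package ships infra/vertex_model.py), and the invocation values are illustrative, with the real call sites living in context_chunk_service.py and the new workflow modules summarized above:

```python
# Sketch only: pairing the new workflow prompt with the ContextChunk
# schema. The human turn and model binding are assumptions.
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_vertexai import ChatVertexAI  # illustrative backend

from wizit_context_ingestor.data.prompts import (
    WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT,
    ContextChunk,
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
        ("human", "<chunk>{chunk_content}</chunk>"),  # hypothetical human turn
    ]
)
llm = ChatVertexAI(model_name="gemini-1.5-pro")  # illustrative model choice
chain = prompt | llm.with_structured_output(ContextChunk)

chunk_context = chain.invoke(
    {
        "document_content": "# Quarterly report\n...",
        "context_additional_instructions": "Keep keywords in Spanish.",
        "chunk_content": "Revenue grew 12% quarter over quarter.",
    }
)
print(chunk_context.context)
```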
wizit_context_ingestor/infra/persistence/local_storage.py

@@ -3,22 +3,27 @@ from ...domain.models import ParsedDoc
 from typing import Optional
 import logging
 import os
+
 logger = logging.getLogger(__name__)
 
+
 class LocalStorageService(PersistenceService):
     """Persistence service for local storage."""
 
-    def __init__(self):
-        self.
+    def __init__(self, source_storage_route: str, target_storage_route: str):
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+        self.supports_tagging = hasattr(self, "retrieve_file_tags")
 
     def load_markdown_file_content(self, file_key: str) -> str:
         """Load markdown file content from local storage."""
         file_content = None
-        with open(
+        with open(
+            f"{self.source_storage_route}/{file_key}", "r", encoding="utf-8"
+        ) as file:
             file_content = file.read()
         return file_content
 
-
     def retrieve_raw_file(self, file_key: str) -> str:
         """Retrieve file path in tmp folder from local storage.
 
@@ -32,16 +37,21 @@ class LocalStorageService(PersistenceService):
         ClientError: If there's an error retrieving the object from local storage
         """
         try:
-            tmp_file_path = f"{self.
+            tmp_file_path = f"{self.source_storage_route}/{file_key}"
             if not os.path.exists(tmp_file_path):
                 raise FileNotFoundError(f"File {file_key} not found in local storage")
             return tmp_file_path
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Unexpected error retrieving file {file_key} from local storage: {str(e)}"
+            )
             raise
 
-
-
+    def save_parsed_document(
+        self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+    ):
         """Save a parsed document."""
-        with open(
+        with open(
+            f"{self.target_storage_route}/{file_key}", "w", encoding="utf-8"
+        ) as f:
             f.write(parsed_document.document_text)
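A quick sketch of how the reworked two-route constructor might be exercised. The paths are hypothetical, and the ParsedDoc constructor call is an assumption based on the document_text attribute used above. Note that the new signature defaults file_tags to a mutable {}, so passing an explicit dict is the safer call pattern:

```python
# Hypothetical usage of the reworked LocalStorageService; paths are
# illustrative only. ParsedDoc is the domain model the module imports.
from wizit_context_ingestor.infra.persistence.local_storage import LocalStorageService
from wizit_context_ingestor.domain.models import ParsedDoc

storage = LocalStorageService(
    source_storage_route="/data/incoming",   # where raw files are read from
    target_storage_route="/data/processed",  # where parsed markdown is written
)

markdown = storage.load_markdown_file_content("report.md")
storage.save_parsed_document(
    "report.md",
    ParsedDoc(document_text=markdown),  # assumes ParsedDoc takes document_text
    file_tags={"ingested": "true"},     # appears unused by the local backend
)
```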
wizit_context_ingestor/infra/persistence/s3_storage.py

@@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)
 
 class S3StorageService(PersistenceService):
     """Persistence service for S3 storage."""
-
-
-
+
+    __slots__ = ("origin_bucket_name", "target_bucket_name", "region_name")
+
+    def __init__(
+        self,
+        origin_bucket_name: str,
+        target_bucket_name: str,
+        region_name: str = "us-east-1",
+    ):
+        self.s3 = boto3_client("s3", region_name=region_name)
         self.origin_bucket_name = origin_bucket_name
         self.target_bucket_name = target_bucket_name
-
+        self.supports_tagging = hasattr(self, "retrieve_file_tags")
 
     def load_markdown_file_content(self, file_key: str) -> str:
         """Load markdown file content from S3 storage.
@@ -36,9 +43,9 @@ class S3StorageService(PersistenceService):
             response = self.s3.get_object(Bucket=self.target_bucket_name, Key=file_key)
             tmp_file_key = f"/tmp/{file_key}"
             os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
-            with open(tmp_file_key,
-                f.write(response[
-            with open(tmp_file_key,
+            with open(tmp_file_key, "wb") as f:
+                f.write(response["Body"].read())
+            with open(tmp_file_key, "r", encoding="utf-8") as f:
                 file_content = f.read()
             return file_content
         except ClientError as e:
@@ -48,7 +55,6 @@ class S3StorageService(PersistenceService):
             logger.error(f"Unexpected error loading file {file_key} from S3: {str(e)}")
             raise
 
-
     def retrieve_raw_file(self, file_key: str) -> str:
         """Retrieve file path in tmp folder from S3 storage.
 
@@ -67,18 +73,21 @@ class S3StorageService(PersistenceService):
             tmp_file_key = f"/tmp/{file_key}"
             # Create parent directories if they don't exist
             os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
-            with open(tmp_file_key,
-                f.write(response[
+            with open(tmp_file_key, "wb") as f:
+                f.write(response["Body"].read())
             return tmp_file_key
         except ClientError as e:
             logger.error(f"Error retrieving file {file_key} from S3: {str(e)}")
             raise
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Unexpected error retrieving file {file_key} from S3: {str(e)}"
+            )
             raise
 
-
-
+    def save_parsed_document(
+        self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+    ):
         """Save a parsed document to S3.
 
         Args:
@@ -91,21 +100,21 @@ class S3StorageService(PersistenceService):
         """
         try:
             # Convert document content to bytes
-            content_bytes = parsed_document.document_text.encode(
+            content_bytes = parsed_document.document_text.encode("utf-8")
             # Upload the file to S3
             if not file_tags:
                 self.s3.put_object(
-                    Bucket=self.target_bucket_name,
-                    Key=file_key,
-                    Body=content_bytes
+                    Bucket=self.target_bucket_name, Key=file_key, Body=content_bytes
                 )
             else:
-                tagging_string = "&".join(
+                tagging_string = "&".join(
+                    [f"{key}={value}" for key, value in file_tags.items()]
+                )
                 self.s3.put_object(
                     Bucket=self.target_bucket_name,
                     Key=file_key,
                     Body=content_bytes,
-                    Tagging=tagging_string
+                    Tagging=tagging_string,
                 )
 
             logger.info(f"Successfully saved document to S3 as {file_key}")
@@ -122,8 +131,5 @@ class S3StorageService(PersistenceService):
         Args:
             file_key: The key (path) to retrieve tags
         """
-        response = self.s3.get_object_tagging(
-            Bucket=bucket_name,
-            Key=file_key
-        )
+        response = self.s3.get_object_tagging(Bucket=bucket_name, Key=file_key)
         return {item["Key"]: item["Value"] for item in response["TagSet"]}
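The Tagging parameter on put_object expects URL-query-style key=value pairs joined by &, which is exactly what the new tagging_string builds (keep in mind S3 wants URL-encoded values, so raw & or = inside a tag value would need quoting). A standalone sketch of the round trip the service now performs; the bucket and key names are hypothetical:

```python
# Standalone sketch of the tag round trip; bucket/key names are
# hypothetical. boto3 is already a dependency of the module under diff.
import boto3

s3 = boto3.client("s3", region_name="us-east-1")
file_tags = {"source": "ingestor", "version": "0.3.0b1"}

# S3 expects tags as a URL-query-style string: "k1=v1&k2=v2".
tagging_string = "&".join(f"{key}={value}" for key, value in file_tags.items())

s3.put_object(
    Bucket="example-target-bucket",
    Key="docs/report.md",
    Body=b"# parsed markdown",
    Tagging=tagging_string,
)

tags = s3.get_object_tagging(Bucket="example-target-bucket", Key="docs/report.md")
as_dict = {item["Key"]: item["Value"] for item in tags["TagSet"]}
```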
wizit_context_ingestor/infra/rag/chroma_embeddings.py

@@ -0,0 +1,135 @@
+from typing_extensions import Sequence
+from test.test_typing import CoolEmployee
+from langchain_core.documents import Document
+from langchain_chroma import Chroma
+from typing import List
+import logging
+from uuid import uuid4
+from ...application.interfaces import EmbeddingsManager
+
+# load_dotenv()
+
+logger = logging.getLogger(__name__)
+
+
+class ChromaEmbeddingsManager(EmbeddingsManager):
+    __slots__ = ("embeddings_model", "collection_name")
+
+    def __init__(
+        self,
+        embeddings_model,
+        chroma_host=None,
+        **chroma_conn_kwargs,
+    ):
+        """
+        Initialize the ChromaEmbeddingsManager.
+        Args:
+            embeddings_model: The embeddings model to use for generating vector embeddings
+                (typically a LangChain embeddings model instance)
+            chroma_host: The Chroma host URL
+
+        Raises:
+            Exception: If there's an error initializing the RedisEmbeddingsManager
+        """
+        self.embeddings_model = embeddings_model
+        self.chroma_host = chroma_host
+        try:
+            if chroma_host:
+                self.chroma = Chroma(
+                    embedding_function=self.embeddings_model,
+                    host=chroma_host,
+                    **chroma_conn_kwargs,
+                )
+                logger.info("ChromaEmbeddingsManager initialized")
+            else:
+                self.chroma = Chroma(
+                    embedding_function=self.embeddings_model, **chroma_conn_kwargs
+                )
+                logger.info("ChromaEmbeddingsManager initialized")
+        except Exception as e:
+            logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
+            raise
+
+    def configure_vector_store(
+        self,
+        table_name: str = "",
+        vector_size: int = 768,
+        content_column: str = "document",
+        id_column: str = "id",
+    ):
+        """Configure the vector store."""
+        pass
+
+    def init_vector_store(
+        self,
+        table_name: str = "",
+        content_column: str = "document",
+        id_column: str = "id",
+    ):
+        """Initialize the vector store."""
+        pass
+
+    def index_documents(self, documents: list[Document]):
+        """
+        Add documents to the vector store with their embeddings.
+
+        This method takes a list of Document objects, generates embeddings for them
+        using the embeddings model, and stores both the documents and their
+        embeddings in the PostgreSQL database.
+
+        Args:
+            docs: A list of LangChain Document objects to add to the vector store
+                Each Document should have page_content and metadata attributes
+                from langchain_core.documents import Document
+        Returns:
+            None
+
+        Raises:
+            Exception: If there's an error adding documents to the vector store
+        """
+        try:
+            logger.info(f"Indexing {len(documents)} documents in vector store")
+            self.chroma.add_documents(documents)
+        except Exception as e:
+            logger.error(f"Error indexing documents: {str(e)}")
+            raise
+
+    def get_documents_by_id(self, ids: list[str]):
+        """
+        Get document by ID from the vector store.
+        """
+        try:
+            return self.chroma.get_by_ids(ids)
+        except Exception as e:
+            logger.error(f"Error getting documents by ID: {str(e)}")
+            raise
+
+    def delete_documents_by_id(self, ids: list[str]):
+        """
+        Delete documents by ID from the vector store.
+        """
+        try:
+            self.chroma.delete(ids)
+        except Exception as e:
+            logger.error(f"Error deleting documents by ID: {str(e)}")
+            raise
+
+    def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+        """
+        Delete documents by filter from the vector store.
+        """
+        try:
+            self.chroma.delete(where={metadata_key: metadata_value})
+        except Exception as error:
+            logger.error(
+                f"Error deleting documents by filter: {str(filter)}, error: {error} "
+            )
+            raise
+
+    def get_documents_keys_by_source_id(self, source_id: str):
+        """Get documents keys by source ID."""
+        pass
+
+    def delete_documents_by_source_id(self, source_id: str):
+        """Delete documents by source ID."""
+        pass
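One caveat before using the new Chroma backend: the module's `from test.test_typing import CoolEmployee` import looks like stray editor autocompletion and will fail on Python builds that ship without the CPython test suite. Assuming the module imports cleanly, a hedged usage sketch follows; the embeddings model, collection name, and directory are illustrative, and the extra kwargs are assumed to pass through to langchain_chroma.Chroma:

```python
# Hedged usage sketch for the new Chroma-backed manager. The embeddings
# model is illustrative; **chroma_conn_kwargs is forwarded to
# langchain_chroma.Chroma, so its constructor arguments
# (collection_name, persist_directory, ...) should pass through.
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

from wizit_context_ingestor.infra.rag.chroma_embeddings import ChromaEmbeddingsManager

manager = ChromaEmbeddingsManager(
    embeddings_model=HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    ),
    collection_name="ingested_docs",       # forwarded to Chroma
    persist_directory="/tmp/chroma-demo",  # local mode, no chroma_host
)

manager.index_documents(
    [Document(page_content="Sample chunk", metadata={"source_id": "doc-1"}, id="doc-1#0")]
)
print(manager.get_documents_by_id(["doc-1#0"]))
manager.delete_documents_by_metadata_key("source_id", "doc-1")
```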
wizit_context_ingestor/infra/rag/pg_embeddings.py

@@ -6,6 +6,7 @@ from langchain_postgres import PGVectorStore, PGEngine
 from sqlalchemy import create_engine
 from dotenv import load_dotenv
 from wizit_context_ingestor.application.interfaces import EmbeddingsManager
+
 load_dotenv()
 
 logger = logging.getLogger(__name__)
@@ -38,19 +39,21 @@ class PgEmbeddingsManager(EmbeddingsManager):
     ... )
     >>> documents = [Document(page_content="Sample text", metadata={"source": "example"})]
     """
+
     __slots__ = ("embeddings_model", "pg_connection")
+
     def __init__(self, embeddings_model, pg_connection: str):
         """
-
+        Initialize the PgEmbeddingsManager.
 
-
-
-
-
+        Args:
+            embeddings_model: The embeddings model to use for generating vector embeddings
+                (typically a LangChain embeddings model instance)
+            pg_connection: The PostgreSQL connection string
+                (format: postgresql://user:password@host:port/database)
 
-
-
+        Raises:
+            Exception: If there's an error initializing the vector store
         """
         self.pg_connection = pg_connection
         self.embeddings_model = embeddings_model
@@ -58,65 +61,65 @@ class PgEmbeddingsManager(EmbeddingsManager):
         self.vector_store = None
         self.record_manager = None
         try:
-
-
+            self.pg_engine = PGEngine.from_connection_string(url=pg_connection)
+            logger.info("PgEmbeddingsManager initialized")
         except Exception as e:
             logger.error(f"Failed to initialize PgEmbeddingsManager: {str(e)}")
             raise
 
     def configure_vector_store(
-
-
-
-
-
-
-
+        self,
+        table_name: str = "langchain_pg_embedding",
+        vector_size: int = 768,
+        content_column: str = "document",
+        id_column: str = "id",
+        metadata_json_column: str = "cmetadata",
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
-
-
-
-
-
-
-
-
-
-
-
-
-        self.record_manager.create_schema()
+        self.pg_engine.init_vectorstore_table(
+            table_name=table_name,
+            vector_size=vector_size,
+            content_column=content_column,
+            id_column=id_column,
+            metadata_json_column=metadata_json_column,
+        )
+        self.record_manager = SQLRecordManager(
+            pg_record_manager, engine=create_engine(url=self.pg_connection)
+        )
+        # TODO move this from here
+        self.record_manager.create_schema()
 
     def init_vector_store(
-
-
-
-
-
-
+        self,
+        table_name: str = "langchain_pg_embedding",
+        content_column: str = "document",
+        metadata_json_column: str = "cmetadata",
+        id_column: str = "id",
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
-
-
-
-
-
-
-
-
-
-
-
-        )
+        self.vector_store = PGVectorStore.create_sync(
+            embedding_service=self.embeddings_model,
+            engine=self.pg_engine,
+            table_name=table_name,
+            content_column=content_column,
+            metadata_json_column=metadata_json_column,
+            id_column=id_column,
+        )
+        self.record_manager = SQLRecordManager(
+            pg_record_manager, engine=create_engine(url=self.pg_connection)
+        )
 
     def vector_store_initialized(func):
         """validate vector store initialization"""
+
         def wrapper(self, *args, **kwargs):
-
-
-
-
-
-
+            # Common validation logic
+            if self.vector_store is None:
+                raise Exception("Vector store not initialized")
+            if self.record_manager is None:
+                raise Exception("Record manager not initialized")
+            return func(self, *args, **kwargs)
+
         return wrapper
 
     @vector_store_initialized
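Finally, the vector_store_initialized guard fleshed out above is an ordinary function decorator defined in the class body: it refuses to run the wrapped method until both the vector store and the record manager exist. A standalone reproduction of the pattern, with a stand-in class instead of the real manager:

```python
# Standalone reproduction of the guard-decorator pattern used above;
# DemoManager is a stand-in class, not part of the package.
def vector_store_initialized(func):
    """Validate vector store initialization before running func."""

    def wrapper(self, *args, **kwargs):
        if self.vector_store is None:
            raise Exception("Vector store not initialized")
        if self.record_manager is None:
            raise Exception("Record manager not initialized")
        return func(self, *args, **kwargs)

    return wrapper


class DemoManager:
    def __init__(self):
        self.vector_store = None
        self.record_manager = None

    @vector_store_initialized
    def index(self, docs):
        return f"indexed {len(docs)} docs"


m = DemoManager()
try:
    m.index(["a"])
except Exception as exc:
    print(exc)  # -> Vector store not initialized
```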