wizit-context-ingestor 0.3.0b2__py3-none-any.whl → 0.3.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wizit-context-ingestor might be problematic. See the registry's advisory page for more details.

@@ -1,3 +1,4 @@
1
+ import asyncio
1
2
  from langchain_core.output_parsers.pydantic import PydanticOutputParser
2
3
  from langchain_core.prompts import ChatPromptTemplate
3
4
  from langchain_core.documents import Document
@@ -44,7 +45,7 @@ class ContextChunksInDocumentService:
44
45
  self.context_additional_instructions = ""
45
46
  self.metadata_source = "source"
46
47
 
47
- def _retrieve_context_chunk_in_document_with_workflow(
48
+ async def _retrieve_context_chunk_in_document_with_workflow(
48
49
  self,
49
50
  workflow,
50
51
  markdown_content: str,
@@ -53,7 +54,7 @@ class ContextChunksInDocumentService:
53
54
  ) -> Document:
54
55
  """Retrieve context chunks in document."""
55
56
  try:
56
- result = workflow.invoke(
57
+ result = await workflow.ainvoke(
57
58
  {
58
59
  "messages": [
59
60
  HumanMessage(
@@ -74,9 +75,7 @@ class ContextChunksInDocumentService:
74
75
  }
75
76
  },
76
77
  )
77
- # chunk.page_content = (
78
- # f"Context:{result['context']}, Content:{chunk.page_content}"
79
- # )
78
+ chunk.page_content = f"<context>\n{result['context']}\n</context>\n <content>\n{chunk.page_content}\n</content>"
80
79
  chunk.metadata["context"] = result["context"]
81
80
  if chunk_metadata:
82
81
  for key, value in chunk_metadata.items():
@@ -154,7 +153,7 @@ class ContextChunksInDocumentService:
154
153
  # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
155
154
  # raise
156
155
 
157
- def retrieve_context_chunks_in_document_with_workflow(
156
+ async def retrieve_context_chunks_in_document_with_workflow(
158
157
  self,
159
158
  markdown_content: str,
160
159
  chunks: List[Document],
@@ -167,7 +166,7 @@ class ContextChunksInDocumentService:
167
166
  )
168
167
  compiled_context_workflow = context_workflow.gen_workflow()
169
168
  compiled_context_workflow = compiled_context_workflow.compile()
170
- context_chunks = list(
169
+ context_chunks_workflow_invocations = list(
171
170
  map(
172
171
  lambda chunk: self._retrieve_context_chunk_in_document_with_workflow(
173
172
  compiled_context_workflow,
@@ -178,12 +177,13 @@ class ContextChunksInDocumentService:
178
177
  chunks,
179
178
  )
180
179
  )
180
+ context_chunks = await asyncio.gather(*context_chunks_workflow_invocations)
181
181
  return context_chunks
182
182
  except Exception as e:
183
183
  logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
184
184
  raise
185
185
 
186
- def get_context_chunks_in_document(self, file_key: str, file_tags: dict = {}):
186
+ async def get_context_chunks_in_document(self, file_key: str, file_tags: dict = {}):
187
187
  """
188
188
  Get the context chunks in a document.
189
189
  """
@@ -199,8 +199,10 @@ class ContextChunksInDocumentService:
199
199
  logger.info(f"Document loaded:{file_key}")
200
200
  chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
201
201
  logger.info(f"Chunks generated:{len(chunks)}")
202
- context_chunks = self.retrieve_context_chunks_in_document_with_workflow(
203
- markdown_content, chunks, file_tags
202
+ context_chunks = (
203
+ await self.retrieve_context_chunks_in_document_with_workflow(
204
+ markdown_content, chunks, file_tags
205
+ )
204
206
  )
205
207
  logger.info(f"Context chunks generated:{len(context_chunks)}")
206
208
  # upsert validation
@@ -1,3 +1,4 @@
1
+ import asyncio
1
2
  from typing import Tuple, List, Dict, Optional
2
3
  from langchain_core.prompts import ChatPromptTemplate
3
4
  from langchain_core.output_parsers.pydantic import PydanticOutputParser
@@ -23,15 +24,15 @@ class TranscriptionService:
23
24
  persistence_service: PersistenceService,
24
25
  target_language: str = "es",
25
26
  transcription_additional_instructions: str = "",
26
- transcription_accuracy_threshold: int = 90,
27
+ transcription_accuracy_threshold: float = 0.90,
27
28
  max_transcription_retries: int = 2,
28
29
  ):
29
30
  self.ai_application_service = ai_application_service
30
31
  self.persistence_service = persistence_service
31
32
  self.target_language = target_language
32
33
  if (
33
- transcription_accuracy_threshold < 0
34
- or transcription_accuracy_threshold > 95
34
+ transcription_accuracy_threshold < 0.0
35
+ or transcription_accuracy_threshold > 0.95
35
36
  ):
36
37
  raise ValueError(
37
38
  "transcription_accuracy_threshold must be between 0 and 95"
@@ -46,6 +47,15 @@ class TranscriptionService:
46
47
  transcription_additional_instructions
47
48
  )
48
49
  self.chat_model = self.ai_application_service.load_chat_model()
50
+ self.transcription_workflow = TranscriptionWorkflow(
51
+ self.chat_model, self.transcription_additional_instructions
52
+ )
53
+ self.compiled_transcription_workflow = (
54
+ self.transcription_workflow.gen_workflow()
55
+ )
56
+ self.compiled_transcription_workflow = (
57
+ self.compiled_transcription_workflow.compile()
58
+ )
49
59
 
50
60
  # def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
51
61
  # """Transcribe an image to text.
@@ -101,19 +111,16 @@ class TranscriptionService:
101
111
  # logger.error(f"Failed to parse document page: {str(e)}")
102
112
  # raise
103
113
 
104
- def parse_doc_page_with_workflow(self, document: ParsedDocPage) -> ParsedDocPage:
114
+ async def parse_doc_page_with_workflow(
115
+ self, document: ParsedDocPage
116
+ ) -> ParsedDocPage:
105
117
  """Transcribe an image to text using an agent.
106
118
  Args:
107
119
  document: The document with the image to transcribe
108
120
  Returns:
109
121
  Processed text
110
122
  """
111
- transcription_workflow = TranscriptionWorkflow(
112
- self.chat_model, self.transcription_additional_instructions
113
- )
114
- compiled_transcription_workflow = transcription_workflow.gen_workflow()
115
- compiled_transcription_workflow = compiled_transcription_workflow.compile()
116
- result = compiled_transcription_workflow.invoke(
123
+ result = await self.compiled_transcription_workflow.ainvoke(
117
124
  {
118
125
  "messages": [
119
126
  HumanMessage(
@@ -146,20 +153,39 @@ class TranscriptionService:
146
153
  if result["transcription"]:
147
154
  document.page_text = result["transcription"]
148
155
  else:
149
- raise ValueError("No transcription found")
156
+ raise ValueError(f"No transcription found: {result} ")
150
157
  return document
151
158
 
152
- def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
159
+ # def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
160
+ # """
161
+ # Process a document by parsing it and returning the parsed content.
162
+ # """
163
+ # raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
164
+ # parse_doc_model_service = ParseDocModelService(raw_file_path)
165
+ # document_pages = parse_doc_model_service.parse_document_to_base64()
166
+ # parsed_pages = []
167
+ # for page in document_pages:
168
+ # page = self.parse_doc_page_with_workflow(page)
169
+ # parsed_pages.append(page)
170
+ # logger.info(f"Parsed {len(parsed_pages)} pages")
171
+ # parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
172
+ # return parsed_pages, parsed_document
173
+
174
+ async def process_document(
175
+ self, file_key: str
176
+ ) -> Tuple[List[ParsedDocPage], ParsedDoc]:
153
177
  """
154
178
  Process a document by parsing it and returning the parsed content.
155
179
  """
156
180
  raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
157
181
  parse_doc_model_service = ParseDocModelService(raw_file_path)
158
182
  document_pages = parse_doc_model_service.parse_document_to_base64()
183
+ parse_pages_workflow_tasks = []
159
184
  parsed_pages = []
160
185
  for page in document_pages:
161
- page = self.parse_doc_page_with_workflow(page)
162
- parsed_pages.append(page)
186
+ parse_pages_workflow_tasks.append(self.parse_doc_page_with_workflow(page))
187
+ # here
188
+ parsed_pages = await asyncio.gather(*parse_pages_workflow_tasks)
163
189
  logger.info(f"Parsed {len(parsed_pages)} pages")
164
190
  parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
165
191
  return parsed_pages, parsed_document
@@ -8,8 +8,9 @@ from ..domain.models import ParsedDocPage, ParsedDoc
8
8
 
9
9
  logger = logging.getLogger(__name__)
10
10
 
11
+
11
12
  # CHECK THIS THING IMPROVE THE WAY CODE IS STRUCTURED
12
- class ParseDocModelService():
13
+ class ParseDocModelService:
13
14
  """
14
15
  Class for parsing PDF documents, converting pages to base64 images
15
16
  """
@@ -25,7 +26,6 @@ class ParseDocModelService():
25
26
  self.pdf_document = pymupdf.open(file_path)
26
27
  self.page_count = self.pdf_document.page_count
27
28
 
28
-
29
29
  def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
30
30
  """
31
31
  Convert a PDF page to a base64-encoded PNG image.
@@ -48,10 +48,7 @@ class ParseDocModelService():
48
48
  img.save(buffer, format="PNG")
49
49
  b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
50
50
  logger.info(f"Page {page_number} encoded successfully")
51
- return ParsedDocPage(
52
- page_number=page_number,
53
- page_base64=b64_encoded_image
54
- )
51
+ return ParsedDocPage(page_number=page_number, page_base64=b64_encoded_image)
55
52
  except Exception as e:
56
53
  logger.error(f"Failed to parse b64 image: {str(e)}")
57
54
  raise
@@ -87,12 +84,10 @@ class ParseDocModelService():
87
84
  Create a markdown content from a list of parsed pages.
88
85
  """
89
86
  md_content = ""
90
- for page in parsed_pages:
87
+ sorted_pages = sorted(parsed_pages, key=lambda page: page.page_number)
88
+ for page in sorted_pages:
91
89
  md_content += f"## Page {page.page_number}\n\n"
92
90
  md_content += f"{page.page_text}\n\n"
93
- return ParsedDoc(
94
- pages=parsed_pages,
95
- document_text=md_content
96
- )
91
+ return ParsedDoc(pages=parsed_pages, document_text=md_content)
97
92
 
98
93
  # def
@@ -3,11 +3,10 @@ import logging
3
3
 
4
4
  logger = logging.getLogger(__name__)
5
5
 
6
- class AwsSecretsManager:
7
-
8
- def __init__(self):
9
- self.client = boto3_client('secretsmanager')
10
6
 
7
+ class AwsSecretsManager:
8
+ def __init__(self, aws_region="us-east-1"):
9
+ self.client = boto3_client("secretsmanager", region_name=aws_region)
11
10
 
12
11
  def get_secret(self, secret_name):
13
12
  """
@@ -78,7 +78,7 @@ class TranscriptionManager:
78
78
  llm_model_id: str = "claude-sonnet-4@20250514",
79
79
  target_language: str = "es",
80
80
  transcription_additional_instructions: str = "",
81
- transcription_accuracy_threshold: int = 90,
81
+ transcription_accuracy_threshold: float = 0.90,
82
82
  max_transcription_retries: int = 2,
83
83
  ):
84
84
  self.gcp_project_id = gcp_project_id
@@ -116,18 +116,18 @@ class TranscriptionManager:
116
116
  return vertex_model
117
117
 
118
118
  def tracing(func):
119
- def gen_tracing_context(self, *args, **kwargs):
119
+ async def gen_tracing_context(self, *args, **kwargs):
120
120
  with tracing_context(
121
121
  enabled=True,
122
122
  project_name=self.langsmith_project_name,
123
123
  client=self.langsmith_client,
124
124
  ):
125
- return func(self, *args, **kwargs)
125
+ return await func(self, *args, **kwargs)
126
126
 
127
127
  return gen_tracing_context
128
128
 
129
129
  @tracing
130
- def transcribe_document(self, file_key: str):
130
+ async def transcribe_document(self, file_key: str):
131
131
  """Transcribe a document from source storage to target storage.
132
132
  This method serves as a generic interface for transcribing documents from
133
133
  various storage sources to target destinations. The specific implementation
@@ -162,9 +162,10 @@ class TranscriptionManager:
162
162
  transcription_accuracy_threshold=self.transcription_accuracy_threshold,
163
163
  max_transcription_retries=self.max_transcription_retries,
164
164
  )
165
- parsed_pages, parsed_document = (
166
- transcribe_document_service.process_document(file_key)
167
- )
165
+ (
166
+ parsed_pages,
167
+ parsed_document,
168
+ ) = await transcribe_document_service.process_document(file_key)
168
169
  source_storage_file_tags = {}
169
170
  if persistence_service.supports_tagging:
170
171
  # source_storage_file_tags.tag_file(file_key, {"status": "transcribed"})
@@ -231,18 +232,18 @@ class ChunksManager:
231
232
  return vertex_model
232
233
 
233
234
  def tracing(func):
234
- def gen_tacing_context(self, *args, **kwargs):
235
+ async def gen_tracing_context(self, *args, **kwargs):
235
236
  with tracing_context(
236
237
  enabled=True,
237
238
  project_name=self.langsmith_project_name,
238
239
  client=self.langsmith_client,
239
240
  ):
240
- return func(self, *args, **kwargs)
241
+ return await func(self, *args, **kwargs)
241
242
 
242
- return gen_tacing_context
243
+ return gen_tracing_context
243
244
 
244
245
  @tracing
245
- def gen_context_chunks(
246
+ async def gen_context_chunks(
246
247
  self, file_key: str, source_storage_route: str, target_storage_route: str
247
248
  ):
248
249
  try:
@@ -272,7 +273,7 @@ class ChunksManager:
272
273
  target_language=self.target_language,
273
274
  )
274
275
  context_chunks = (
275
- context_chunks_in_document_service.get_context_chunks_in_document(
276
+ await context_chunks_in_document_service.get_context_chunks_in_document(
276
277
  file_key, target_bucket_file_tags
277
278
  )
278
279
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: wizit-context-ingestor
3
- Version: 0.3.0b2
3
+ Version: 0.3.0b4
4
4
  Summary: Contextual Rag with Cloud Solutions
5
5
  Requires-Dist: anthropic[vertex]>=0.66.0
6
6
  Requires-Dist: boto3>=1.40.23
@@ -1,16 +1,16 @@
1
1
  wizit_context_ingestor/.DS_Store,sha256=c7hZ0C8v2hxprMlCgmvxXDl92phew3iSATJzE1yYTBs,6148
2
2
  wizit_context_ingestor/__init__.py,sha256=TSTm5qSpNNCz9ilKYkXRUxupvmWG2AHfv7RBWFw8T4c,107
3
3
  wizit_context_ingestor/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- wizit_context_ingestor/application/context_chunk_service.py,sha256=zKdnjNr5woi4PHseLEAcfdTNRvOroAkU_52pwLZLmBc,8858
4
+ wizit_context_ingestor/application/context_chunk_service.py,sha256=LYRKBsY30IT2LczkgkYdPx7W3yycRy-0m7t3KKgq6Nw,9046
5
5
  wizit_context_ingestor/application/interfaces.py,sha256=W0qonE3t-S-zwAoKtDYc4oyW_GOILKVmrdy8LnC8MVI,3193
6
- wizit_context_ingestor/application/transcription_service.py,sha256=4Z_STIRgExY5VnVWbyZ_oSnx_bgSfjfPA2N7tCYb5bg,7334
6
+ wizit_context_ingestor/application/transcription_service.py,sha256=jAjQE_sR0E3CSHLf0lq-24scl-_VKWy3crGhiodkoSM,8394
7
7
  wizit_context_ingestor/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  wizit_context_ingestor/data/kdb.py,sha256=GCkXQmnk2JCXV_VJ-h0k55AOIX8qohzBJN2v-9D1dlU,194
9
9
  wizit_context_ingestor/data/prompts.py,sha256=EnocoriDjPcFPd6Af9G6TUTB8NkO4EFN4AUHfpRVqYU,14406
10
10
  wizit_context_ingestor/data/storage.py,sha256=aanXY1AV696cShHtDDhlJDhKPouZ1dq2lo_57yhTd20,198
11
11
  wizit_context_ingestor/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  wizit_context_ingestor/domain/models.py,sha256=DV83PArMyh-VoUqnVF_ohcgStsk549ixdYw98B8o2GI,381
13
- wizit_context_ingestor/domain/services.py,sha256=0i9WwZ0ufBgnzNJ5dt8Iop9VLTeK_AqjcaH8p3Av26I,3347
13
+ wizit_context_ingestor/domain/services.py,sha256=dg8UvYSjYsOMphrciZyGvuRriM8Qf08SstvO979XrFc,3344
14
14
  wizit_context_ingestor/infra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  wizit_context_ingestor/infra/aws_model.py,sha256=glIaewSdv6PDBXoCe6QgCUIzLCjtM7KlayEERXRNFwo,2539
16
16
  wizit_context_ingestor/infra/persistence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,9 +21,9 @@ wizit_context_ingestor/infra/rag/pg_embeddings.py,sha256=D7onh27SvqYahYAsLy6Deyk
21
21
  wizit_context_ingestor/infra/rag/redis_embeddings.py,sha256=pCP_I1RLeIUTYMSHkZT6AjIOyHA9A47wyffrZBjiG0s,5107
22
22
  wizit_context_ingestor/infra/rag/semantic_chunks.py,sha256=Xes1MwlShKbqVulspXzfb6zJuqd8iBX3nKuy-5BtSfk,2473
23
23
  wizit_context_ingestor/infra/secrets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- wizit_context_ingestor/infra/secrets/aws_secrets_manager.py,sha256=1k_R_uzLabptiZ1GXAoqAgYpk8EykXIb-pUDdidUDJQ,1202
24
+ wizit_context_ingestor/infra/secrets/aws_secrets_manager.py,sha256=vukil5sO9tQPTM74wUbyQqR8Z-z0ElyjeF2ns7rbVbQ,1249
25
25
  wizit_context_ingestor/infra/vertex_model.py,sha256=6L2C4qH7PSVjdOSzIEZlFtUwu1pgQVXtQBIU5isn644,7582
26
- wizit_context_ingestor/main.py,sha256=WohTQiWOEHshrYnjD0TJWbqsOHhpzb0-ywrdpDgj8Kw,11616
26
+ wizit_context_ingestor/main.py,sha256=vP9311d9TcXtoxmOE1-4jAzKjrNU5ZaxiJiG2d9IV1w,11687
27
27
  wizit_context_ingestor/services/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
28
28
  wizit_context_ingestor/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
29
  wizit_context_ingestor/services/chunks.py,sha256=tQQsdsOscZWzqVY5WxVxr3ii62FOJ3nMARaJJz6CvjQ,2011
@@ -39,6 +39,6 @@ wizit_context_ingestor/workflows/transcription_schemas.py,sha256=CQCl7LXD5voxhJO
39
39
  wizit_context_ingestor/workflows/transcription_state.py,sha256=2Z_t2aZFEH_nAjdEO6RFBEmi_fwvr9cV0aLS1eIxiCQ,590
40
40
  wizit_context_ingestor/workflows/transcription_tools.py,sha256=FtIfWFITn8_Rr5SEobCeR55aJGZoHRMgF2UxRT5vJ-E,1373
41
41
  wizit_context_ingestor/workflows/transcription_workflow.py,sha256=77cLsYGdv01Py2GaKYpACuifPeSxH7tkVodvLv97sdg,1621
42
- wizit_context_ingestor-0.3.0b2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
43
- wizit_context_ingestor-0.3.0b2.dist-info/METADATA,sha256=Ww9m__uLznS-mcEQNWbRqngtJukxPAlIPHOgyynlLo4,3768
44
- wizit_context_ingestor-0.3.0b2.dist-info/RECORD,,
42
+ wizit_context_ingestor-0.3.0b4.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
43
+ wizit_context_ingestor-0.3.0b4.dist-info/METADATA,sha256=iPcDYUP3VQKukxafk_9HeJQqzw8WqLKrZH71cmbuIYw,3768
44
+ wizit_context_ingestor-0.3.0b4.dist-info/RECORD,,