wizit-context-ingestor 0.2.3b0__tar.gz → 0.2.4b0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of wizit-context-ingestor might be problematic.
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/PKG-INFO +9 -1
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/README.md +8 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/pyproject.toml +3 -1
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/aws_model.py +1 -1
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +3 -3
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +1 -1
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +1 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +1 -1
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +3 -3
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/vertex_model.py +4 -4
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/main.py +2 -1
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/.DS_Store +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/__init__.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/application/__init__.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/application/context_chunk_service.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/application/interfaces.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/application/transcription_service.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/data/__init__.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/data/prompts.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/domain/__init__.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/domain/models.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/domain/services.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/__init__.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/persistence/local_storage.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/services/.DS_Store +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/services/__init__.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/services/chunks.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/services/parse_doc.py +0 -0
- {wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: wizit-context-ingestor
-Version: 0.2.3b0
+Version: 0.2.4b0
 Summary: Contextual Rag with Cloud Solutions
 Requires-Dist: anthropic[vertex]>=0.66.0
 Requires-Dist: boto3>=1.40.23
@@ -79,6 +79,14 @@ from main import context_chunks_in_document
 # Get semantic chunks from a document
 context_chunks_in_document("your-document.pdf")
 ```
+## Running Memory Profiler
+
+To run the memory profiler, use the following command:
+
+```bash
+python -m memray run test_redis.py
+```
+
 
 ## Project Structure
 
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/README.md
RENAMED
@@ -64,6 +64,14 @@ from main import context_chunks_in_document
 # Get semantic chunks from a document
 context_chunks_in_document("your-document.pdf")
 ```
+## Running Memory Profiler
+
+To run the memory profiler, use the following command:
+
+```bash
+python -m memray run test_redis.py
+```
+
 
 ## Project Structure
 
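The memray command added to the README records allocations from the CLI. As a rough illustration of the same profiling step driven from Python, here is a minimal sketch using memray's Tracker API; `workload()` is a hypothetical stand-in for whatever test_redis.py exercises, not code from this package.

```python
# Minimal sketch of the memory-profiling step from the README, using memray's
# Tracker API instead of `python -m memray run`. workload() is a placeholder;
# it is not part of wizit-context-ingestor.
import memray

def workload():
    # Allocate something measurable so the capture is non-trivial.
    return [bytes(1024) for _ in range(10_000)]

with memray.Tracker("memray-workload.bin"):  # writes the allocation capture file
    workload()

# The capture can then be rendered, for example with:
#   python -m memray flamegraph memray-workload.bin
```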
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [project]
 name = "wizit_context_ingestor"
-version = "0.2.3-beta"
+version = "0.2.4-beta"
 description = "Contextual Rag with Cloud Solutions"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -17,6 +17,8 @@ dependencies = [
 
 [dependency-groups]
 dev = [
+    "memray>=1.18.0",
+    "pyinstrument>=5.1.1",
     "python-dotenv>=1.1.1",
 ]
 
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/aws_model.py
RENAMED
@@ -12,7 +12,7 @@ class AWSModels(AiApplicationService):
     A wrapper class for Google Cloud Vertex AI models that handles credentials and
     provides methods to load embeddings and chat models.
     """
-
+    __slots__ = ('llm_model_id')
     def __init__(
         self,
         llm_model_id: str = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"
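Several of the changes in this release add `__slots__` declarations (here and in the storage, embeddings, and chunking classes below). As a minimal, package-independent sketch of what that buys: slotted instances carry no per-instance `__dict__`, which lowers memory use when many small objects are created. The classes below are illustrative only and are not taken from the package.

```python
# Illustrative only: compares a plain class with a slotted class to show why
# __slots__ was added throughout this release. Neither class is from the package.
import sys

class PlainConfig:
    def __init__(self, llm_model_id):
        self.llm_model_id = llm_model_id

class SlottedConfig:
    __slots__ = ("llm_model_id",)
    def __init__(self, llm_model_id):
        self.llm_model_id = llm_model_id

plain, slotted = PlainConfig("model-a"), SlottedConfig("model-a")
print(hasattr(plain, "__dict__"))     # True: each instance owns a dict
print(hasattr(slotted, "__dict__"))   # False: attributes live in fixed slots
print(sys.getsizeof(plain.__dict__))  # the per-instance dict the slotted class avoids
```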
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/persistence/s3_storage.py
RENAMED
@@ -1,6 +1,6 @@
 from ...application.interfaces import PersistenceService
 from ...domain.models import ParsedDoc
-import boto3
+from boto3 import client as boto3_client
 import logging
 import os
 from botocore.exceptions import ClientError
@@ -10,9 +10,9 @@ logger = logging.getLogger(__name__)
 
 class S3StorageService(PersistenceService):
     """Persistence service for S3 storage."""
-
+    __slots__ = ('origin_bucket_name', 'target_bucket_name', 'region_name')
     def __init__(self, origin_bucket_name: str, target_bucket_name: str, region_name: str = 'us-east-1'):
-        self.s3 = boto3.client('s3', region_name=region_name)
+        self.s3 = boto3_client('s3', region_name=region_name)
         self.origin_bucket_name = origin_bucket_name
         self.target_bucket_name = target_bucket_name
 
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/rag/pg_embeddings.py
RENAMED
@@ -38,7 +38,7 @@ class PgEmbeddingsManager(EmbeddingsManager):
     ... )
     >>> documents = [Document(page_content="Sample text", metadata={"source": "example"})]
     """
-
+    __slots__ = ("embeddings_model", "pg_connection")
     def __init__(self, embeddings_model, pg_connection: str):
         """
         Initialize the PgEmbeddingsManager.
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py
RENAMED
@@ -11,6 +11,7 @@ logger = logging.getLogger(__name__)
 
 class RedisEmbeddingsManager(EmbeddingsManager):
 
+    __slots__ = ("embeddings_model", "redis_conn_string", "metadata_tags")
     def __init__(self, embeddings_model, redis_conn_string: str, metadata_tags: dict):
         """
         Initialize the RedisEmbeddingsManager.
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py
RENAMED
@@ -16,7 +16,7 @@ class SemanticChunks(RagChunker):
     Class for semantically chunking documents into smaller pieces based on semantic similarity.
     Uses LangChain's SemanticChunker to create semantically coherent document chunks.
     """
-
+    __slots__ = ("embeddings_model",)
     def __init__(self, embeddings_model: Any):
         """
         Initialize a document chunker with an embeddings model.
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py
RENAMED
@@ -1,4 +1,4 @@
-import boto3
+from boto3 import client as boto3_client
 import logging
 
 logger = logging.getLogger(__name__)
@@ -6,7 +6,7 @@ logger = logging.getLogger(__name__)
 class AwsSecretsManager:
 
     def __init__(self):
-        self.client = boto3.client('secretsmanager')
+        self.client = boto3_client('secretsmanager')
 
 
     def get_secret(self, secret_name):
@@ -30,4 +30,4 @@ class AwsSecretsManager:
             return msg
         except Exception as e:
             logger.error(f"An unknown error occurred: {str(e)}.")
-            raise
+            raise
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/infra/vertex_model.py
RENAMED
@@ -1,4 +1,4 @@
-import vertexai
+from vertexai import init as vertexai_init
 from google.oauth2 import service_account
 from langchain_google_vertexai import VertexAIEmbeddings, ChatVertexAI
 from langchain_google_vertexai.model_garden import ChatAnthropicVertex
@@ -15,7 +15,7 @@ class VertexModels(AiApplicationService):
     A wrapper class for Google Cloud Vertex AI models that handles credentials and
     provides methods to load embeddings and chat models.
     """
-
+    __slots__ = ('project_id', 'location', 'json_service_account', 'scopes', 'llm_model_id')
     def __init__(
         self,
         project_id: str,
@@ -42,7 +42,7 @@ class VertexModels(AiApplicationService):
         self.llm_model_id = llm_model_id
         self.project_id = project_id
         self.location = location
-        vertexai.init(
+        vertexai_init(
             project=project_id,
             location=location,
             credentials=self.credentials
@@ -54,7 +54,7 @@ class VertexModels(AiApplicationService):
 
     def load_embeddings_model(
         self,
-        embeddings_model_id: str = "text-embedding-
+        embeddings_model_id: str = "text-multilingual-embedding-002") -> VertexAIEmbeddings:  # noqa: E125
         """
         Load and return a Vertex AI embeddings model.
         default embeddings length is 768 https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings
{wizit_context_ingestor-0.2.3b0 → wizit_context_ingestor-0.2.4b0}/src/wizit_context_ingestor/main.py
RENAMED
@@ -78,6 +78,7 @@ class DeelabRedisChunksManager:
         gcp_secret_name: str,
         redis_connection_string: str,
         llm_model_id: str = "claude-3-5-haiku@20241022",
+        embeddings_model_id: str = "text-multilingual-embedding-002",
         target_language: str = "es"
     ):
         self.gcp_project_id = gcp_project_id
@@ -89,7 +90,7 @@ class DeelabRedisChunksManager:
         self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
         self.redis_connection_string = redis_connection_string
         self.vertex_model = self._get_vertex_model()
-        self.embeddings_model = self.vertex_model.load_embeddings_model()
+        self.embeddings_model = self.vertex_model.load_embeddings_model(embeddings_model_id)
 
     def _get_gcp_sa_dict(self, gcp_secret_name: str):
         vertex_gcp_sa = self.aws_secrets_manager.get_secret(gcp_secret_name)
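For context on the main.py change, a hedged usage sketch of the new `embeddings_model_id` argument follows. The import path and the constructor values are assumptions based only on what this diff shows; the class may require additional arguments that are not visible here.

```python
# Hedged sketch: passing the embeddings_model_id keyword added in 0.2.4b0.
# All values below are placeholders; only the parameter names visible in the
# diff come from the package, and other required arguments may exist.
from wizit_context_ingestor.main import DeelabRedisChunksManager  # assumed import path

manager = DeelabRedisChunksManager(
    gcp_project_id="my-gcp-project",                       # placeholder
    gcp_secret_name="my-vertex-sa-secret",                 # placeholder
    redis_connection_string="redis://localhost:6379/0",    # placeholder
    llm_model_id="claude-3-5-haiku@20241022",
    embeddings_model_id="text-multilingual-embedding-002", # new keyword in this release
    target_language="es",
)
```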