sunholo 0.77.4__py3-none-any.whl → 0.78.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunholo/agents/route.py +14 -11
- sunholo/azure/auth.py +61 -0
- sunholo/azure/blobs.py +47 -0
- sunholo/chunker/azure.py +51 -2
- sunholo/chunker/encode_metadata.py +64 -0
- sunholo/chunker/message_data.py +97 -3
- sunholo/chunker/process_chunker_data.py +6 -2
- sunholo/cli/embedder.py +1 -58
- sunholo/invoke/invoke_vac_utils.py +2 -2
- {sunholo-0.77.4.dist-info → sunholo-0.78.1.dist-info}/METADATA +4 -2
- {sunholo-0.77.4.dist-info → sunholo-0.78.1.dist-info}/RECORD +15 -12
- {sunholo-0.77.4.dist-info → sunholo-0.78.1.dist-info}/LICENSE.txt +0 -0
- {sunholo-0.77.4.dist-info → sunholo-0.78.1.dist-info}/WHEEL +0 -0
- {sunholo-0.77.4.dist-info → sunholo-0.78.1.dist-info}/entry_points.txt +0 -0
- {sunholo-0.77.4.dist-info → sunholo-0.78.1.dist-info}/top_level.txt +0 -0
sunholo/agents/route.py
CHANGED
|
@@ -14,6 +14,19 @@
|
|
|
14
14
|
from ..logging import log
|
|
15
15
|
from ..utils import load_config, ConfigManager
|
|
16
16
|
|
|
17
|
+
def read_cloud_run_url(agent, cloud_run_urls_file='config/cloud_run_urls.json'):
    """
    Look up the URL registered for ``agent`` in the Cloud Run URL config.

    Args:
        agent: Key to look up in the mapping loaded from the config file.
        cloud_run_urls_file: Path handed to ``load_config``.

    Returns:
        The value stored under ``agent`` in the loaded mapping.

    Raises:
        ValueError: If ``agent`` is not a key of the loaded mapping.
    """
    # load_config returns a pair; only the first element (the mapping) is used.
    url_map, _ = load_config(cloud_run_urls_file)
    log.info(f'agent_route: {url_map}')

    try:
        resolved_url = url_map[agent]
    except KeyError:
        raise ValueError(f'agent_url not found for {agent}')
    else:
        log.info(f'agent_url: {resolved_url}')
        return resolved_url
|
|
29
|
+
|
|
17
30
|
def route_vac(vector_name: str=None, config=None) -> str :
|
|
18
31
|
"""
|
|
19
32
|
Considers what VAC this vector_name belongs to
|
|
@@ -30,18 +43,8 @@ def route_vac(vector_name: str=None, config=None) -> str :
|
|
|
30
43
|
return agent_url
|
|
31
44
|
|
|
32
45
|
agent = config.vacConfig('agent')
|
|
33
|
-
log.info(f'agent_type: {agent}')
|
|
34
|
-
|
|
35
|
-
agent_route, _ = load_config('config/cloud_run_urls.json')
|
|
36
|
-
log.info(f'agent_route: {agent_route}')
|
|
37
46
|
|
|
38
|
-
|
|
39
|
-
agent_url = agent_route[agent]
|
|
40
|
-
except KeyError:
|
|
41
|
-
raise ValueError(f'agent_url not found for {agent}')
|
|
42
|
-
|
|
43
|
-
log.info(f'agent_url: {agent_url}')
|
|
44
|
-
return agent_url
|
|
47
|
+
return read_cloud_run_url(agent)
|
|
45
48
|
|
|
46
49
|
def route_endpoint(vector_name=None, method = 'post', override_endpoint=None, config=None):
|
|
47
50
|
|
sunholo/azure/auth.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import os
|
|
2
|
+
try:
|
|
3
|
+
from azure.identity import DefaultAzureCredential, ClientSecretCredential
|
|
4
|
+
except ImportError:
|
|
5
|
+
DefaultAzureCredential = None
|
|
6
|
+
ClientSecretCredential = None
|
|
7
|
+
|
|
8
|
+
from ..logging import log
|
|
9
|
+
|
|
10
|
+
def azure_auth():
    r"""
    Build an Azure credential object, preferring default credentials.

    ``DefaultAzureCredential()`` is constructed first (e.g. when running
    inside Azure Container Apps or similar). If that construction raises,
    the error is logged and a ``ClientSecretCredential`` is built from
    environment variables instead. These can be set up via:

    ```bash
    az ad sp create-for-rbac --name "myApp" --role contributor \
        --scopes /subscriptions/{subscription-id}/resourceGroups/{resource-group} \
        --sdk-auth

    export AZURE_CLIENT_ID="your-client-id"
    export AZURE_CLIENT_SECRET="your-client-secret"
    export AZURE_TENANT_ID="your-tenant-id"
    ```

    Returns:
        The credential object, or None when the service principal
        environment variables are missing or the fallback construction fails.

    Raises:
        ImportError: If the optional ``azure.identity`` import failed.
    """
    install_hint = "Azure identity credentials library needed - install via `pip install sunholo[azure]`"

    if DefaultAzureCredential is None:
        raise ImportError(install_hint)

    # First choice: ambient/default credentials.
    try:
        return DefaultAzureCredential()
    except Exception as default_err:
        log.error(f"Failed to authenticate with default credentials: {str(default_err)}")
        log.info("Attempting to authenticate using ClientSecretCredential")

    # Fallback: service principal described by environment variables.
    sp_settings = {
        "client_id": os.getenv("AZURE_CLIENT_ID"),
        "client_secret": os.getenv("AZURE_CLIENT_SECRET"),
        "tenant_id": os.getenv("AZURE_TENANT_ID"),
    }

    if not all(sp_settings.values()):
        log.error("Service principal credentials are not set in environment variables")
        return None

    if ClientSecretCredential is None:
        raise ImportError(install_hint)

    try:
        return ClientSecretCredential(**sp_settings)
    except Exception as sp_err:
        log.error(f"Failed to authenticate with service principal: {str(sp_err)}")
        return None
|
|
61
|
+
|
sunholo/azure/blobs.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from ..logging import log
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def is_azure_blob(message_data):
    """
    Checks if the provided URL is an Azure Blob Storage URL.

    The URL must have the form
    ``https://<account>.blob.core.windows.net/<container>/<blob>``.
    The account must be a single host label (no ``.``, ``/``, ``?`` or ``#``)
    and the dots of the host suffix are matched literally, so a URL that only
    mentions the suffix in its path or query string is rejected.

    Args:
        message_data (str): The URL to be checked.

    Returns:
        bool: True if the URL is an Azure Blob Storage URL, False otherwise
        (including when ``message_data`` is not a string).
    """
    if not isinstance(message_data, str):
        return False

    # Escaped dots + restricted character classes: the previous pattern used
    # bare '.' and '(.*)', which let arbitrary hosts pass as the "account".
    blob_url_pattern = r"https://([^./?#]+)\.blob\.core\.windows\.net/([^/?#]+)/(.+)"
    return re.match(blob_url_pattern, message_data) is not None
|
|
21
|
+
|
|
22
|
+
def extract_blob_parts(message_data):
    """
    Extracts the account name, container name, and blob name from an Azure Blob Storage URL.

    The URL must have the form
    ``https://<account>.blob.core.windows.net/<container>/<blob>``. The
    container is the first path segment only; everything after it (including
    further ``/`` separators) is the blob name.

    Args:
        message_data (str): The Azure Blob Storage URL.

    Returns:
        tuple: A tuple containing the account name, container name, and blob name.
               Returns (None, None, None) if the URL is invalid.
    """
    # Single match against a strict pattern. Every failure path returns a
    # 3-tuple so callers can always unpack three values.
    blob_url_pattern = r"https://([^./?#]+)\.blob\.core\.windows\.net/([^/?#]+)/(.+)"
    match = re.match(blob_url_pattern, message_data) if isinstance(message_data, str) else None
    if match is None:
        return None, None, None

    log.debug("Detected Azure blob storage URL")
    account_name, container_name, blob_name = match.groups()

    return account_name, container_name, blob_name
|
|
47
|
+
|
sunholo/chunker/azure.py
CHANGED
|
@@ -11,10 +11,16 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
+
import json
|
|
15
|
+
from datetime import datetime, timezone
|
|
14
16
|
|
|
15
17
|
from ..logging import log
|
|
16
18
|
from ..azure import process_azure_blob_event
|
|
19
|
+
|
|
20
|
+
from ..invoke import invoke_vac
|
|
17
21
|
from .process_chunker_data import process_chunker_data
|
|
22
|
+
from ..chunker.encode_metadata import create_metadata, encode_data
|
|
23
|
+
from ..agents.route import read_cloud_run_url
|
|
18
24
|
|
|
19
25
|
def data_to_embed_azure(events: list):
|
|
20
26
|
"""Triggered from a message on an Azure Data Grid event.
|
|
@@ -24,6 +30,8 @@ def data_to_embed_azure(events: list):
|
|
|
24
30
|
validation_event_type = "Microsoft.EventGrid.SubscriptionValidationEvent"
|
|
25
31
|
storage_blob_created_event = "Microsoft.Storage.BlobCreated"
|
|
26
32
|
|
|
33
|
+
all_chunks = []
|
|
34
|
+
vac_name = None
|
|
27
35
|
for event in events:
|
|
28
36
|
event_type = event['eventType']
|
|
29
37
|
data = event['data']
|
|
@@ -36,6 +44,47 @@ def data_to_embed_azure(events: list):
|
|
|
36
44
|
return {"ValidationResponse": validation_code}
|
|
37
45
|
elif event_type == storage_blob_created_event:
|
|
38
46
|
|
|
39
|
-
message_data, metadata,
|
|
47
|
+
message_data, metadata, vac_name = process_azure_blob_event(events)
|
|
48
|
+
metadata["return_chunks"] = True
|
|
49
|
+
|
|
50
|
+
#TODO: process the azure blob URL and download it
|
|
51
|
+
|
|
52
|
+
chunks = process_chunker_data(message_data, metadata, vac_name)
|
|
53
|
+
if chunks:
|
|
54
|
+
all_chunks.extend(chunks)
|
|
55
|
+
|
|
56
|
+
if not all_chunks or len(chunks) == 0:
|
|
57
|
+
return {'status': 'error', 'message': f'No chunks were found in events: {events}'}
|
|
58
|
+
|
|
59
|
+
if not vac_name:
|
|
60
|
+
return {'status': 'error', 'message': f'Could not find a valid VAC config name in payload {all_chunks}'}
|
|
61
|
+
|
|
62
|
+
metadata = create_metadata(vac_name, metadata)
|
|
63
|
+
|
|
64
|
+
embeds = []
|
|
65
|
+
|
|
66
|
+
for chunk in chunks:
|
|
67
|
+
log.info(f"Working on chunk {chunk['metadata']}")
|
|
68
|
+
|
|
69
|
+
# do this async?
|
|
70
|
+
content = chunk.get("page_content")
|
|
71
|
+
now_utc = datetime.now(timezone.utc)
|
|
72
|
+
formatted_time = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
73
|
+
chunk["metadata"]["eventTime"] = formatted_time
|
|
74
|
+
if not content:
|
|
75
|
+
log.error("No content chunk found, skipping")
|
|
76
|
+
|
|
77
|
+
continue
|
|
78
|
+
|
|
79
|
+
log.info(f"Sending chunk length {len(content)} to embedder")
|
|
80
|
+
processed_chunk = encode_data(vac = vac_name, content = json.dumps(chunk))
|
|
81
|
+
|
|
82
|
+
embed_url = read_cloud_run_url('embedder')
|
|
83
|
+
|
|
84
|
+
embed_res = invoke_vac(f"{embed_url}/embed_chunk", processed_chunk)
|
|
85
|
+
embeds.append(embed_res)
|
|
86
|
+
|
|
87
|
+
log.info("Embedding pipeline finished")
|
|
88
|
+
|
|
89
|
+
return embed_res
|
|
40
90
|
|
|
41
|
-
return process_chunker_data(message_data, metadata, vector_name)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
import base64
|
|
3
|
+
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
from ..logging import log
|
|
7
|
+
|
|
8
|
+
def create_metadata(vac, metadata):
    """
    Build the metadata dict for a message, stamping the standard fields.

    Args:
        vac: Value stored under the ``vector_name`` key.
        metadata: Existing metadata as a dict, a JSON string encoding an
            object, or a falsy value for "none".

    Returns:
        dict: The metadata with ``vector_name``, ``source`` and ``eventTime``
        set. A dict passed in is updated in place and returned. If the input
        cannot be parsed into a dict, the error is logged and only the
        standard fields are returned.
    """
    formatted_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    default_metadata = {"vector_name": vac, "source": "sunholo-cli", "eventTime": formatted_time}

    if not metadata:
        metadata = {}
    elif not isinstance(metadata, dict):
        try:
            parsed = json.loads(metadata)
        except (TypeError, ValueError) as err:
            # json.JSONDecodeError is a ValueError subclass
            log.error(f"[bold red]ERROR: metadata not parsed: {err} for {metadata}")
            parsed = {}
        if not isinstance(parsed, dict):
            log.error(f"[bold red]ERROR: metadata is not a JSON object: {metadata}")
            parsed = {}
        # Always continue with a dict so the update below cannot fail.
        metadata = parsed

    # NOTE: the standard fields overwrite any same-named keys already present.
    metadata.update(default_metadata)

    return metadata
|
|
29
|
+
|
|
30
|
+
def encode_data(vac, content, metadata=None, local_chunks=False):
    """
    Wrap ``content`` in a message envelope with base64-encoded data.

    Args:
        vac: Stored as the ``namespace`` attribute and passed to
            ``create_metadata``.
        content: String payload; it is UTF-8 encoded and then base64 encoded.
        metadata: Optional metadata passed through ``create_metadata`` and
            merged into the message attributes.
        local_chunks: Stored lower-cased as the ``return_chunks`` attribute.

    Returns:
        dict: ``{"message": {"data", "messageId", "publishTime", "attributes"}}``.

    Raises:
        ValueError: If ``content`` is not a string.
    """
    metadata = create_metadata(vac, metadata)

    # Only string content is supported
    if not isinstance(content, str):
        raise ValueError(f"Unsupported content type: {type(content)}")
    encoded_payload = base64.b64encode(content.encode('utf-8')).decode('utf-8')

    publish_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Base attributes first; metadata keys are merged on top of them
    attributes = {
        "namespace": vac,
        "return_chunks": str(local_chunks).lower()
    }
    attributes.update(metadata)

    return {
        "message": {
            "data": encoded_payload,
            "messageId": str(uuid.uuid4()),
            "publishTime": publish_time,
            "attributes": attributes,
        }
    }
|
sunholo/chunker/message_data.py
CHANGED
|
@@ -18,11 +18,17 @@ import tempfile
|
|
|
18
18
|
import os
|
|
19
19
|
import re
|
|
20
20
|
import json
|
|
21
|
+
|
|
21
22
|
try:
|
|
22
23
|
from google.cloud import storage
|
|
23
24
|
except ImportError:
|
|
24
25
|
storage = None
|
|
25
26
|
|
|
27
|
+
try:
|
|
28
|
+
from azure.storage.blob import BlobServiceClient
|
|
29
|
+
except ImportError:
|
|
30
|
+
BlobServiceClient = None
|
|
31
|
+
|
|
26
32
|
from langchain.schema import Document
|
|
27
33
|
|
|
28
34
|
|
|
@@ -33,8 +39,8 @@ from . import loaders
|
|
|
33
39
|
|
|
34
40
|
from ..utils.parsers import extract_urls
|
|
35
41
|
from ..gcs.add_file import add_file_to_gcs, get_pdf_split_file_name
|
|
36
|
-
|
|
37
|
-
|
|
42
|
+
from ..azure.blobs import extract_blob_parts
|
|
43
|
+
from ..azure.auth import azure_auth
|
|
38
44
|
|
|
39
45
|
def handle_gcs_message(message_data: str, metadata: dict, vector_name: str):
|
|
40
46
|
|
|
@@ -199,4 +205,92 @@ def handle_json_content_message(message_data: dict, metadata: dict, vector_name:
|
|
|
199
205
|
|
|
200
206
|
chunks = chunk_doc_to_docs(docs, vector_name=vector_name)
|
|
201
207
|
|
|
202
|
-
return chunks, metadata
|
|
208
|
+
return chunks, metadata
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def handle_azure_blob(message_data: str, metadata: dict, vector_name: str):
    """
    Processes a message from Azure Blob storage, downloads the file, processes it,
    and returns chunks and metadata.

    A PDF that splits into more than one page is not chunked here: each page
    is uploaded back to the same container under ``<stem>_parts/`` and
    ``(None, None)`` is returned.

    Args:
        message_data (str): URL of the Azure blob.
        metadata (dict): Metadata associated with the file. Updated in place.
        vector_name (str): Vector name for processing.

    Returns:
        chunks (list): List of document chunks, or None when nothing was chunked.
        metadata (dict): Updated metadata, or None when the URL could not be
            parsed, no credential was available, or split pages were sent back.

    Raises:
        ImportError: If the optional ``azure.storage.blob`` import failed.
    """

    if BlobServiceClient is None:
        raise ImportError("BlobServiceClient is not installed - install via pip install sunholo[azure]")

    account_name, container_name, blob_name = extract_blob_parts(message_data)
    # extract_blob_parts signals an unparseable URL with None values; stop
    # before they are interpolated into the account URL.
    if not account_name or not container_name or not blob_name:
        log.error(f"Could not parse Azure blob URL: {message_data}")
        return None, None

    credential = azure_auth()
    if credential is None:
        log.error("BlobServiceClient could not find auth credentials")
        return None, None

    # Create a BlobServiceClient
    blob_service_client = BlobServiceClient(
        account_url=f"https://{account_name}.blob.core.windows.net",
        credential=credential)

    # Get the blob client
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

    file_name = pathlib.Path(blob_name)

    with tempfile.TemporaryDirectory() as temp_dir:
        tmp_file_path = os.path.join(temp_dir, file_name.name)
        with open(tmp_file_path, "wb") as file:
            download_stream = blob_client.download_blob()
            file.write(download_stream.readall())

        if file_name.suffix.lower() == ".pdf":
            pages = split_pdf_to_pages(tmp_file_path, temp_dir)
            if not metadata.get("source"):
                metadata["source"] = str(file_name)
            if len(pages) > 1:
                log.info(f"Got back {len(pages)} pages for file {tmp_file_path}")
                # A BlobClient is bound to one blob and its upload_blob takes no
                # `name`; uploading under a new name needs the container client.
                container_client = blob_service_client.get_container_client(container_name)
                for pp in pages:
                    pp_basename = os.path.basename(pp)
                    # file_name/pdf_parts/file_name_1.pdf
                    azure_blob_path = f"{file_name.stem}_parts/{pp_basename}"
                    # Upload split pages back to Azure Blob storage
                    with open(pp, "rb") as page_file:
                        container_client.upload_blob(name=azure_blob_path, data=page_file)
                    log.info(f"{azure_blob_path} is now in container {container_name}")
                log.info(f"Sent split pages for {file_name.name} back to Azure Blob to parallelize the imports")
                return None, None
        else:
            # just original temp file
            pages = [tmp_file_path]

        the_metadata = {
            "type": "file_load_azure_blob",
            "container_name": container_name
        }

        if metadata.get("source") is None:
            the_metadata["source"] = str(file_name)

        metadata.update(the_metadata)

        docs = []
        for page in pages:
            log.info(f"Sending file {page} to loaders.read_file_to_documents {metadata}")
            docs2 = loaders.read_file_to_documents(page, metadata=metadata)
            if docs2 is None:
                log.warning(f"loaders.read_file_to_documents docs2 failed to load file {metadata}")
                # Skip this page: extending with None would raise TypeError.
                continue
            docs.extend(docs2)

        # docs is always a list here, so test for emptiness rather than None.
        if not docs:
            log.warning(f"loaders.read_file_to_documents docs failed to load file {metadata}")
            return None, metadata

        chunks = chunk_doc_to_docs(docs, file_name.suffix, vector_name=vector_name)

        return chunks, metadata
|
|
@@ -5,7 +5,8 @@ from .message_data import (
|
|
|
5
5
|
handle_google_drive_message,
|
|
6
6
|
handle_github_message,
|
|
7
7
|
handle_http_message,
|
|
8
|
-
handle_json_content_message
|
|
8
|
+
handle_json_content_message,
|
|
9
|
+
handle_azure_blob
|
|
9
10
|
)
|
|
10
11
|
|
|
11
12
|
from . import loaders
|
|
@@ -13,7 +14,7 @@ from ..llamaindex.import_files import llamaindex_chunker_check
|
|
|
13
14
|
from ..discovery_engine.chunker_handler import discovery_engine_chunker_check
|
|
14
15
|
from .publish import process_docs_chunks_vector_name
|
|
15
16
|
from .splitter import chunk_doc_to_docs
|
|
16
|
-
|
|
17
|
+
from ..azure.blobs import is_azure_blob
|
|
17
18
|
|
|
18
19
|
from ..logging import log
|
|
19
20
|
|
|
@@ -43,6 +44,9 @@ def process_chunker_data(message_data, metadata, vector_name):
|
|
|
43
44
|
if message_data.startswith("gs://"):
|
|
44
45
|
chunks, metadata = handle_gcs_message(message_data, metadata, vector_name)
|
|
45
46
|
|
|
47
|
+
elif is_azure_blob(message_data):
|
|
48
|
+
chunks, metadata = handle_azure_blob(message_data, metadata, vector_name)
|
|
49
|
+
|
|
46
50
|
elif message_data.startswith("https://drive.google.com") or message_data.startswith("https://docs.google.com"):
|
|
47
51
|
chunks, metadata = handle_google_drive_message(message_data, metadata, vector_name)
|
|
48
52
|
|
sunholo/cli/embedder.py
CHANGED
|
@@ -11,64 +11,7 @@ from rich.progress import Progress
|
|
|
11
11
|
from ..invoke import invoke_vac
|
|
12
12
|
from .chat_vac import resolve_service_url
|
|
13
13
|
from .run_proxy import stop_proxy
|
|
14
|
-
|
|
15
|
-
def create_metadata(vac, metadata):
|
|
16
|
-
now_utc = datetime.now(timezone.utc)
|
|
17
|
-
formatted_time = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
18
|
-
|
|
19
|
-
# Default metadata if none provided
|
|
20
|
-
default_metadata = {"vector_name": vac, "source": "sunholo-cli", "eventTime": formatted_time}
|
|
21
|
-
|
|
22
|
-
try:
|
|
23
|
-
# Merge default metadata with provided metadata
|
|
24
|
-
if metadata:
|
|
25
|
-
if not isinstance(metadata, dict):
|
|
26
|
-
metadata = json.loads(metadata)
|
|
27
|
-
else:
|
|
28
|
-
metadata = {}
|
|
29
|
-
except Exception as err:
|
|
30
|
-
console.print(f"[bold red]ERROR: metadata not parsed: {err} for {metadata}")
|
|
31
|
-
|
|
32
|
-
# Update metadata with default values if not present
|
|
33
|
-
metadata.update(default_metadata)
|
|
34
|
-
|
|
35
|
-
return metadata
|
|
36
|
-
|
|
37
|
-
def encode_data(vac, content, metadata=None, local_chunks=False):
|
|
38
|
-
|
|
39
|
-
metadata = create_metadata(vac, metadata)
|
|
40
|
-
|
|
41
|
-
# Encode the content (URL)
|
|
42
|
-
if isinstance(content, str):
|
|
43
|
-
message_data = base64.b64encode(content.encode('utf-8')).decode('utf-8')
|
|
44
|
-
else:
|
|
45
|
-
raise ValueError(f"Unsupported content type: {type(content)}")
|
|
46
|
-
|
|
47
|
-
now_utc = datetime.now(timezone.utc)
|
|
48
|
-
formatted_time = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
49
|
-
|
|
50
|
-
# Construct the message dictionary
|
|
51
|
-
messageId = str(uuid.uuid4())
|
|
52
|
-
message = {
|
|
53
|
-
"message": {
|
|
54
|
-
"data": message_data,
|
|
55
|
-
"messageId": messageId,
|
|
56
|
-
"publishTime": formatted_time,
|
|
57
|
-
"attributes": {
|
|
58
|
-
"namespace": vac,
|
|
59
|
-
"return_chunks": str(local_chunks).lower()
|
|
60
|
-
},
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
# Merge metadata with attributes
|
|
65
|
-
message["message"]["attributes"].update(metadata)
|
|
66
|
-
|
|
67
|
-
#console.print()
|
|
68
|
-
#console.print(f"Sending message: {messageId} with metadata:")
|
|
69
|
-
#console.print(f"{message['message']['attributes']}")
|
|
70
|
-
|
|
71
|
-
return message
|
|
14
|
+
from ..chunker.encode_metadata import create_metadata, encode_data
|
|
72
15
|
|
|
73
16
|
def embed_command(args):
|
|
74
17
|
chunk_args = vars(args).copy()
|
|
@@ -33,10 +33,10 @@ def invoke_vac(service_url, data, vector_name=None, metadata=None, is_file=False
|
|
|
33
33
|
else:
|
|
34
34
|
json_data = json.loads(data)
|
|
35
35
|
except json.JSONDecodeError as err:
|
|
36
|
-
log.error(f"
|
|
36
|
+
log.error(f"ERROR: invalid JSON: {str(err)}")
|
|
37
37
|
raise err
|
|
38
38
|
except Exception as err:
|
|
39
|
-
log.error(f"
|
|
39
|
+
log.error(f"ERROR: could not parse JSON: {str(err)}")
|
|
40
40
|
raise err
|
|
41
41
|
|
|
42
42
|
log.debug(f"Sending data: {data} or json_data: {json.dumps(json_data)}")
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sunholo
|
|
3
|
-
Version: 0.77.4
|
|
3
|
+
Version: 0.78.1
|
|
4
4
|
Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
|
|
5
5
|
Home-page: https://github.com/sunholo-data/sunholo-py
|
|
6
|
-
Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.77.4.tar.gz
|
|
6
|
+
Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.78.1.tar.gz
|
|
7
7
|
Author: Holosun ApS
|
|
8
8
|
Author-email: multivac@sunholo.com
|
|
9
9
|
License: Apache License, Version 2.0
|
|
@@ -25,6 +25,7 @@ Requires-Dist: langchain-experimental >0.0.60
|
|
|
25
25
|
Requires-Dist: langchain-community
|
|
26
26
|
Provides-Extra: all
|
|
27
27
|
Requires-Dist: asyncpg ; extra == 'all'
|
|
28
|
+
Requires-Dist: azure-identity ; extra == 'all'
|
|
28
29
|
Requires-Dist: azure-storage-blob ; extra == 'all'
|
|
29
30
|
Requires-Dist: fastapi ; extra == 'all'
|
|
30
31
|
Requires-Dist: flask ; extra == 'all'
|
|
@@ -74,6 +75,7 @@ Requires-Dist: unstructured[local-inference] ==0.14.9 ; extra == 'all'
|
|
|
74
75
|
Provides-Extra: anthropic
|
|
75
76
|
Requires-Dist: langchain-anthropic >=0.1.13 ; extra == 'anthropic'
|
|
76
77
|
Provides-Extra: azure
|
|
78
|
+
Requires-Dist: azure-identity ; extra == 'azure'
|
|
77
79
|
Requires-Dist: azure-storage-blob ; extra == 'azure'
|
|
78
80
|
Provides-Extra: cli
|
|
79
81
|
Requires-Dist: jsonschema >=4.21.1 ; extra == 'cli'
|
|
@@ -5,7 +5,7 @@ sunholo/agents/chat_history.py,sha256=8iX1bgvRW6fdp6r_DQR_caPHYrZ_9QJJgPxCiSDf3q
|
|
|
5
5
|
sunholo/agents/dispatch_to_qa.py,sha256=A8skiZ-CtDvYdP0tnXL4sWM3BCDBgdjFVyrqy-h8Aa4,8374
|
|
6
6
|
sunholo/agents/langserve.py,sha256=eSNJ4G5eGKjmyMQLM_uTOjiS-D_W4QhCLrsC4Vsnk7E,4407
|
|
7
7
|
sunholo/agents/pubsub.py,sha256=5hbbhbBGyVWRpt2sAGC5FEheYH1mCCwVUhZEB1S7vGg,1337
|
|
8
|
-
sunholo/agents/route.py,sha256=
|
|
8
|
+
sunholo/agents/route.py,sha256=FE9qTNzCkdB6pzxp5iqi9pyc2-2pC3FrwUU9tVLnULU,2979
|
|
9
9
|
sunholo/agents/special_commands.py,sha256=ecD5jrBVXo170sdgPILi0m_m_4nRFEv6qKn5zYEvEK8,6494
|
|
10
10
|
sunholo/agents/swagger.py,sha256=w5eCShufIjZLuF1SHQgPLABFM1f0FrU0KlX8Y41KrIo,11191
|
|
11
11
|
sunholo/agents/fastapi/__init__.py,sha256=S_pj4_bTUmDGoq_exaREHlOKThi0zTuGT0VZY0YfODQ,88
|
|
@@ -22,19 +22,22 @@ sunholo/auth/gcloud.py,sha256=PdbwkuTdRi4RKBmgG9uwsReegqC4VG15_tw5uzmA7Fs,298
|
|
|
22
22
|
sunholo/auth/refresh.py,sha256=uOdT7oQRVl0YsUP__NXj6PdUdLyXFSv2ylwF283esuw,1831
|
|
23
23
|
sunholo/auth/run.py,sha256=SBghZdWEwXhxuyeZ1s68RBE7foI_kUn6NcRvzFk_iVw,2769
|
|
24
24
|
sunholo/azure/__init__.py,sha256=S1WQ5jndzNgzhSBh9UpX_yw7hRVm3hCzkAWNxUdK4dA,48
|
|
25
|
+
sunholo/azure/auth.py,sha256=e9kus3-V8b_a-lC1_TeOevFMwDwM1JcluJkxpKQs83g,2229
|
|
26
|
+
sunholo/azure/blobs.py,sha256=mAO00FpRP7SUUoYdQEhqi_hEQBocblGHSUqnYy5EdDs,1397
|
|
25
27
|
sunholo/azure/event_grid.py,sha256=uXunwdjVLxNRf38aTRPoC9HXxFEFlL8JH9dijaOlF8M,2567
|
|
26
28
|
sunholo/bots/__init__.py,sha256=EMFd7e2z68l6pzYOnkzHbLd2xJRvxTKFRNCTuhZ8hIw,130
|
|
27
29
|
sunholo/bots/discord.py,sha256=cCFae5K1BCa6JVkWGLh_iZ9qFO1JpXb6K4eJrlDfEro,2442
|
|
28
30
|
sunholo/bots/github_webhook.py,sha256=5pQPRLM_wxxcILVaIzUDV8Kt7Arcm2dL1r1kMMHA524,9629
|
|
29
31
|
sunholo/bots/webapp.py,sha256=EIMxdAJ_xtufwJmvnn7N_Fb_1hZ9DjhJ0Kf_hp02vEU,1926
|
|
30
32
|
sunholo/chunker/__init__.py,sha256=A5canS0XPgisHu0OZ7sVdILgEHGzgH9kpkDi4oBwLZk,135
|
|
31
|
-
sunholo/chunker/azure.py,sha256=
|
|
33
|
+
sunholo/chunker/azure.py,sha256=iZ0mXjei0cILsLuSUnZK0mmUUsQNiC3ZQr1iX8q5IeY,3263
|
|
32
34
|
sunholo/chunker/doc_handling.py,sha256=rIyknpzDyj5A0u_DqSQVD_CXLRNZPOU6TCL4bhCdjOI,8563
|
|
35
|
+
sunholo/chunker/encode_metadata.py,sha256=SYHaqKcr4lCzwmrzUGhgX4_l4pzDv7wAeNCw7a461MA,1912
|
|
33
36
|
sunholo/chunker/images.py,sha256=Xmh1vwHrVhoXm5iH2dhCc52O8YgdzE8KrDSdL-pGnp8,1861
|
|
34
37
|
sunholo/chunker/loaders.py,sha256=xiToUVgPz2ZzcqpUAq7aNP3PTenb_rBUAFzu0JPycIg,10268
|
|
35
|
-
sunholo/chunker/message_data.py,sha256=
|
|
38
|
+
sunholo/chunker/message_data.py,sha256=T1LXUoVZ7SQda8rgt9lfpg5taNmE2wxmmfWUR3SG6W4,10676
|
|
36
39
|
sunholo/chunker/pdfs.py,sha256=daCZ1xjn1YvxlifIyxskWNpLJLe-Q9D_Jq12MWx3tZo,2473
|
|
37
|
-
sunholo/chunker/process_chunker_data.py,sha256=
|
|
40
|
+
sunholo/chunker/process_chunker_data.py,sha256=z9An3I8BuE0JDXMdKtB7CO885KOKA51wcBxDXUL1qoA,3516
|
|
38
41
|
sunholo/chunker/publish.py,sha256=tiO615A2uo_ZjzdFDzNH1PL_1kJeLMUQwLJ4w67rNIc,2932
|
|
39
42
|
sunholo/chunker/pubsub.py,sha256=XgLAuOFNDSqKEBvzRa0TSylZdPecRVHMp0nmmQ_OVco,1005
|
|
40
43
|
sunholo/chunker/splitter.py,sha256=jtGfi_ZdhVdyFhfw0e4ynEpmwIyrxQtV63OituYWy6o,6729
|
|
@@ -44,7 +47,7 @@ sunholo/cli/cli.py,sha256=3ZMcsR1VLCdrsfm0zGBQ9TKqO5qkOrtZ6-iVNmr6f_8,3820
|
|
|
44
47
|
sunholo/cli/cli_init.py,sha256=JMZ9AX2cPDZ-_mv3adiv2ToFVNyRPtjk9Biszl1kiR0,2358
|
|
45
48
|
sunholo/cli/configs.py,sha256=QUM9DvKOdZmEQRM5uI3Nh887T0YDiSMr7O240zTLqws,4546
|
|
46
49
|
sunholo/cli/deploy.py,sha256=zxdwUsRTRMC8U5vyRv0JiKBLFn84Ug_Tc88-_h9hJSs,1609
|
|
47
|
-
sunholo/cli/embedder.py,sha256=
|
|
50
|
+
sunholo/cli/embedder.py,sha256=v-FKiSPHaQzB6ctClclYueIf3bf3CqYtC1oRgPfT4dY,5566
|
|
48
51
|
sunholo/cli/merge_texts.py,sha256=U9vdMwKmcPoc6iPOWX5MKSxn49dNGbNzVLw8ui5PhEU,1823
|
|
49
52
|
sunholo/cli/run_proxy.py,sha256=OeR12ZfnasbJ-smBZQznmGufoDa4iNjUN9FCFo5JxSc,11520
|
|
50
53
|
sunholo/cli/sun_rich.py,sha256=UpMqeJ0C8i0pkue1AHnnyyX0bFJ9zZeJ7HBR6yhuA8A,54
|
|
@@ -80,7 +83,7 @@ sunholo/gcs/download_url.py,sha256=iCIPESi2viQ-TcCINpbJXxUt7XJFFpF0KiVgSA6zFis,5
|
|
|
80
83
|
sunholo/gcs/metadata.py,sha256=C9sMPsHsq1ETetdQCqB3EBs3Kws8b8QHS9L7ei_v5aw,891
|
|
81
84
|
sunholo/invoke/__init__.py,sha256=bELcqIjzKvaupcIN5OQmDgGx_8jARtH9T6PCe8UgcvE,99
|
|
82
85
|
sunholo/invoke/direct_vac_func.py,sha256=mr-xjIQyvn918Txpe1IRkV36Sp-lpS-e202c2hfTFMk,4471
|
|
83
|
-
sunholo/invoke/invoke_vac_utils.py,sha256=
|
|
86
|
+
sunholo/invoke/invoke_vac_utils.py,sha256=71nPT5M5Gmij0Ioapw-d92t7-umDqf0KANh8SYiFVF8,2046
|
|
84
87
|
sunholo/langfuse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
85
88
|
sunholo/langfuse/callback.py,sha256=CTaos8sYcrga949BG6lIZ4I62DiiQSHxwz5re9XjDWQ,1677
|
|
86
89
|
sunholo/langfuse/prompts.py,sha256=EkbzSw9Jr05ULMsRDoGOp-frbtCZpnvdYSJEYNpzfX8,1293
|
|
@@ -129,9 +132,9 @@ sunholo/vertex/init.py,sha256=uyg76EqS39jWJ2gxMqXOLWP6MQ2hc81wFdwgG86ZoCM,2868
|
|
|
129
132
|
sunholo/vertex/memory_tools.py,sha256=pomHrDKqvY8MZxfUqoEwhdlpCvSGP6KmFJMVKOimXjs,6842
|
|
130
133
|
sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
|
|
131
134
|
sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
|
|
132
|
-
sunholo-0.
|
|
133
|
-
sunholo-0.
|
|
134
|
-
sunholo-0.
|
|
135
|
-
sunholo-0.
|
|
136
|
-
sunholo-0.
|
|
137
|
-
sunholo-0.
|
|
135
|
+
sunholo-0.78.1.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
|
|
136
|
+
sunholo-0.78.1.dist-info/METADATA,sha256=cyU6V7XfznAgynkdDomy5H2xyGbwpLGo0ChvyRaO1qo,7348
|
|
137
|
+
sunholo-0.78.1.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
138
|
+
sunholo-0.78.1.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
|
|
139
|
+
sunholo-0.78.1.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
|
|
140
|
+
sunholo-0.78.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|