sunholo 0.77.4__tar.gz → 0.78.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. {sunholo-0.77.4 → sunholo-0.78.2}/PKG-INFO +4 -2
  2. {sunholo-0.77.4 → sunholo-0.78.2}/setup.py +3 -1
  3. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/route.py +14 -11
  4. sunholo-0.78.2/sunholo/azure/auth.py +61 -0
  5. sunholo-0.78.2/sunholo/azure/blobs.py +47 -0
  6. sunholo-0.78.2/sunholo/chunker/azure.py +90 -0
  7. sunholo-0.78.2/sunholo/chunker/encode_metadata.py +64 -0
  8. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/message_data.py +100 -3
  9. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/process_chunker_data.py +6 -2
  10. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/embedder.py +1 -58
  11. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/invoke/invoke_vac_utils.py +2 -2
  12. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo.egg-info/PKG-INFO +4 -2
  13. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo.egg-info/SOURCES.txt +3 -0
  14. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo.egg-info/requires.txt +2 -0
  15. sunholo-0.77.4/sunholo/chunker/azure.py +0 -41
  16. {sunholo-0.77.4 → sunholo-0.78.2}/LICENSE.txt +0 -0
  17. {sunholo-0.77.4 → sunholo-0.78.2}/MANIFEST.in +0 -0
  18. {sunholo-0.77.4 → sunholo-0.78.2}/README.md +0 -0
  19. {sunholo-0.77.4 → sunholo-0.78.2}/setup.cfg +0 -0
  20. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/__init__.py +0 -0
  21. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/__init__.py +0 -0
  22. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/chat_history.py +0 -0
  23. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/dispatch_to_qa.py +0 -0
  24. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/fastapi/__init__.py +0 -0
  25. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/fastapi/base.py +0 -0
  26. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/fastapi/qna_routes.py +0 -0
  27. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/flask/__init__.py +0 -0
  28. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/flask/base.py +0 -0
  29. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/flask/qna_routes.py +0 -0
  30. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/flask/vac_routes.py +0 -0
  31. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/langserve.py +0 -0
  32. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/pubsub.py +0 -0
  33. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/special_commands.py +0 -0
  34. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/agents/swagger.py +0 -0
  35. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/archive/__init__.py +0 -0
  36. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/archive/archive.py +0 -0
  37. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/auth/__init__.py +0 -0
  38. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/auth/gcloud.py +0 -0
  39. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/auth/refresh.py +0 -0
  40. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/auth/run.py +0 -0
  41. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/azure/__init__.py +0 -0
  42. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/azure/event_grid.py +0 -0
  43. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/bots/__init__.py +0 -0
  44. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/bots/discord.py +0 -0
  45. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/bots/github_webhook.py +0 -0
  46. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/bots/webapp.py +0 -0
  47. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/__init__.py +0 -0
  48. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/doc_handling.py +0 -0
  49. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/images.py +0 -0
  50. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/loaders.py +0 -0
  51. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/pdfs.py +0 -0
  52. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/publish.py +0 -0
  53. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/pubsub.py +0 -0
  54. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/chunker/splitter.py +0 -0
  55. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/__init__.py +0 -0
  56. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/chat_vac.py +0 -0
  57. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/cli.py +0 -0
  58. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/cli_init.py +0 -0
  59. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/configs.py +0 -0
  60. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/deploy.py +0 -0
  61. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/merge_texts.py +0 -0
  62. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/run_proxy.py +0 -0
  63. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/sun_rich.py +0 -0
  64. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/swagger.py +0 -0
  65. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/cli/vertex.py +0 -0
  66. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/components/__init__.py +0 -0
  67. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/components/llm.py +0 -0
  68. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/components/retriever.py +0 -0
  69. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/components/vectorstore.py +0 -0
  70. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/__init__.py +0 -0
  71. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/alloydb.py +0 -0
  72. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/alloydb_client.py +0 -0
  73. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/database.py +0 -0
  74. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/lancedb.py +0 -0
  75. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/sql/sb/create_function.sql +0 -0
  76. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/sql/sb/create_function_time.sql +0 -0
  77. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/sql/sb/create_table.sql +0 -0
  78. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/sql/sb/delete_source_row.sql +0 -0
  79. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/sql/sb/return_sources.sql +0 -0
  80. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/sql/sb/setup.sql +0 -0
  81. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/static_dbs.py +0 -0
  82. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/database/uuid.py +0 -0
  83. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/discovery_engine/__init__.py +0 -0
  84. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/discovery_engine/chunker_handler.py +0 -0
  85. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/discovery_engine/create_new.py +0 -0
  86. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/discovery_engine/discovery_engine_client.py +0 -0
  87. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/embedder/__init__.py +0 -0
  88. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/embedder/embed_chunk.py +0 -0
  89. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/gcs/__init__.py +0 -0
  90. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/gcs/add_file.py +0 -0
  91. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/gcs/download_folder.py +0 -0
  92. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/gcs/download_url.py +0 -0
  93. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/gcs/metadata.py +0 -0
  94. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/invoke/__init__.py +0 -0
  95. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/invoke/direct_vac_func.py +0 -0
  96. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/langfuse/__init__.py +0 -0
  97. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/langfuse/callback.py +0 -0
  98. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/langfuse/prompts.py +0 -0
  99. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/llamaindex/__init__.py +0 -0
  100. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/llamaindex/generate.py +0 -0
  101. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/llamaindex/get_files.py +0 -0
  102. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/llamaindex/import_files.py +0 -0
  103. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/logging.py +0 -0
  104. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/lookup/__init__.py +0 -0
  105. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/lookup/model_lookup.yaml +0 -0
  106. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/patches/__init__.py +0 -0
  107. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/patches/langchain/__init__.py +0 -0
  108. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/patches/langchain/lancedb.py +0 -0
  109. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/patches/langchain/vertexai.py +0 -0
  110. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/pubsub/__init__.py +0 -0
  111. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/pubsub/process_pubsub.py +0 -0
  112. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/pubsub/pubsub_manager.py +0 -0
  113. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/qna/__init__.py +0 -0
  114. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/qna/parsers.py +0 -0
  115. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/qna/retry.py +0 -0
  116. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/streaming/__init__.py +0 -0
  117. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/streaming/content_buffer.py +0 -0
  118. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/streaming/langserve.py +0 -0
  119. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/streaming/stream_lookup.py +0 -0
  120. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/streaming/streaming.py +0 -0
  121. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/summarise/__init__.py +0 -0
  122. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/summarise/summarise.py +0 -0
  123. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/tools/__init__.py +0 -0
  124. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/tools/web_browser.py +0 -0
  125. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/__init__.py +0 -0
  126. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/api_key.py +0 -0
  127. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/big_context.py +0 -0
  128. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/config.py +0 -0
  129. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/config_class.py +0 -0
  130. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/config_schema.py +0 -0
  131. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/gcp.py +0 -0
  132. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/gcp_project.py +0 -0
  133. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/parsers.py +0 -0
  134. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/timedelta.py +0 -0
  135. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/user_ids.py +0 -0
  136. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/utils/version.py +0 -0
  137. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/vertex/__init__.py +0 -0
  138. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/vertex/extensions_call.py +0 -0
  139. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/vertex/extensions_class.py +0 -0
  140. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/vertex/genai_functions.py +0 -0
  141. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/vertex/init.py +0 -0
  142. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/vertex/memory_tools.py +0 -0
  143. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/vertex/safety.py +0 -0
  144. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo/vertex/type_dict_to_json.py +0 -0
  145. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo.egg-info/dependency_links.txt +0 -0
  146. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo.egg-info/entry_points.txt +0 -0
  147. {sunholo-0.77.4 → sunholo-0.78.2}/sunholo.egg-info/top_level.txt +0 -0
  148. {sunholo-0.77.4 → sunholo-0.78.2}/tests/test_chat_history.py +0 -0
  149. {sunholo-0.77.4 → sunholo-0.78.2}/tests/test_config.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sunholo
3
- Version: 0.77.4
3
+ Version: 0.78.2
4
4
  Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
5
5
  Home-page: https://github.com/sunholo-data/sunholo-py
6
- Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.77.4.tar.gz
6
+ Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.78.2.tar.gz
7
7
  Author: Holosun ApS
8
8
  Author-email: multivac@sunholo.com
9
9
  License: Apache License, Version 2.0
@@ -25,6 +25,7 @@ Requires-Dist: langchain_experimental>0.0.60
25
25
  Requires-Dist: langchain-community
26
26
  Provides-Extra: all
27
27
  Requires-Dist: asyncpg; extra == "all"
28
+ Requires-Dist: azure-identity; extra == "all"
28
29
  Requires-Dist: azure-storage-blob; extra == "all"
29
30
  Requires-Dist: fastapi; extra == "all"
30
31
  Requires-Dist: flask; extra == "all"
@@ -72,6 +73,7 @@ Requires-Dist: tantivy; extra == "all"
72
73
  Requires-Dist: tiktoken; extra == "all"
73
74
  Requires-Dist: unstructured[local-inference]==0.14.9; extra == "all"
74
75
  Provides-Extra: azure
76
+ Requires-Dist: azure-identity; extra == "azure"
75
77
  Requires-Dist: azure-storage-blob; extra == "azure"
76
78
  Provides-Extra: cli
77
79
  Requires-Dist: jsonschema>=4.21.1; extra == "cli"
@@ -1,7 +1,7 @@
1
1
  from setuptools import setup, find_packages
2
2
 
3
3
  # Define your base version
4
- version = '0.77.4'
4
+ version = '0.78.2'
5
5
 
6
6
  setup(
7
7
  name='sunholo',
@@ -40,6 +40,7 @@ setup(
40
40
  # Define optional dependencies with feature names
41
41
  'all': [
42
42
  "asyncpg",
43
+ "azure-identity",
43
44
  "azure-storage-blob",
44
45
  "fastapi",
45
46
  "flask",
@@ -89,6 +90,7 @@ setup(
89
90
 
90
91
  ],
91
92
  'azure': [
93
+ "azure-identity",
92
94
  "azure-storage-blob"
93
95
  ],
94
96
  'cli': [
@@ -14,6 +14,19 @@
14
14
  from ..logging import log
15
15
  from ..utils import load_config, ConfigManager
16
16
 
17
+ def read_cloud_run_url(agent, cloud_run_urls_file='config/cloud_run_urls.json'):
18
+ agent_route, _ = load_config(cloud_run_urls_file)
19
+ log.info(f'agent_route: {agent_route}')
20
+
21
+ try:
22
+ agent_url = agent_route[agent]
23
+ except KeyError:
24
+ raise ValueError(f'agent_url not found for {agent}')
25
+
26
+ log.info(f'agent_url: {agent_url}')
27
+
28
+ return agent_url
29
+
17
30
  def route_vac(vector_name: str=None, config=None) -> str :
18
31
  """
19
32
  Considers what VAC this vector_name belongs to
@@ -30,18 +43,8 @@ def route_vac(vector_name: str=None, config=None) -> str :
30
43
  return agent_url
31
44
 
32
45
  agent = config.vacConfig('agent')
33
- log.info(f'agent_type: {agent}')
34
-
35
- agent_route, _ = load_config('config/cloud_run_urls.json')
36
- log.info(f'agent_route: {agent_route}')
37
46
 
38
- try:
39
- agent_url = agent_route[agent]
40
- except KeyError:
41
- raise ValueError(f'agent_url not found for {agent}')
42
-
43
- log.info(f'agent_url: {agent_url}')
44
- return agent_url
47
+ return read_cloud_run_url(agent)
45
48
 
46
49
  def route_endpoint(vector_name=None, method = 'post', override_endpoint=None, config=None):
47
50
 
@@ -0,0 +1,61 @@
1
+ import os
2
+ try:
3
+ from azure.identity import DefaultAzureCredential, ClientSecretCredential
4
+ except ImportError:
5
+ DefaultAzureCredential = None
6
+ ClientSecretCredential = None
7
+
8
+ from ..logging import log
9
+
10
+ def azure_auth():
11
+ """
12
+ Will attempt to authenticate using default credentials first (e.g. you are running within Azure Container Apps or similar)
13
+
14
+ If default credentials are not available, will attempt to authenticate via env vars - set up via:
15
+
16
+ ```bash
17
+ az ad sp create-for-rbac --name "myApp" --role contributor \
18
+ --scopes /subscriptions/{subscription-id}/resourceGroups/{resource-group} \
19
+ --sdk-auth
20
+
21
+ export AZURE_CLIENT_ID="your-client-id"
22
+ export AZURE_CLIENT_SECRET="your-client-secret"
23
+ export AZURE_TENANT_ID="your-tenant-id"
24
+ ```
25
+
26
+ """
27
+ if DefaultAzureCredential is None:
28
+ raise ImportError("Azure identity credentials library needed - install via `pip install sunholo[azure]`")
29
+
30
+ # Use DefaultAzureCredential to authenticate
31
+ try:
32
+ credential = DefaultAzureCredential()
33
+ return credential
34
+
35
+ except Exception as e:
36
+ log.error(f"Failed to authenticate with default credentials: {str(e)}")
37
+ log.info("Attempting to authenticate using ClientSecretCredential")
38
+
39
+ # Use ClientSecretCredential to authenticate with a service principal
40
+ client_id = os.getenv("AZURE_CLIENT_ID")
41
+ client_secret = os.getenv("AZURE_CLIENT_SECRET")
42
+ tenant_id = os.getenv("AZURE_TENANT_ID")
43
+
44
+ if not client_id or not client_secret or not tenant_id:
45
+ log.error("Service principal credentials are not set in environment variables")
46
+ return None
47
+
48
+ if ClientSecretCredential is None:
49
+ raise ImportError("Azure identity credentials library needed - install via `pip install sunholo[azure]`")
50
+
51
+ try:
52
+ credential = ClientSecretCredential(
53
+ client_id=client_id,
54
+ client_secret=client_secret,
55
+ tenant_id=tenant_id
56
+ )
57
+ return credential
58
+ except Exception as e:
59
+ log.error(f"Failed to authenticate with service principal: {str(e)}")
60
+ return None
61
+
@@ -0,0 +1,47 @@
1
+ import re
2
+ from ..logging import log
3
+
4
+
5
+ def is_azure_blob(message_data):
6
+ """
7
+ Checks if the provided URL is an Azure Blob Storage URL.
8
+
9
+ Args:
10
+ message_data (str): The URL to be checked.
11
+
12
+ Returns:
13
+ bool: True if the URL is an Azure Blob Storage URL, False otherwise.
14
+ """
15
+ blob_url_pattern = r"https://(.*).blob.core.windows.net/(.*)/(.*)"
16
+ match = re.match(blob_url_pattern, message_data)
17
+ if not match:
18
+ return False
19
+
20
+ return True
21
+
22
+ def extract_blob_parts(message_data):
23
+ """
24
+ Extracts the account name, container name, and blob name from an Azure Blob Storage URL.
25
+
26
+ Args:
27
+ message_data (str): The Azure Blob Storage URL.
28
+
29
+ Returns:
30
+ tuple: A tuple containing the account name, container name, and blob name.
31
+ Returns (None, None, None) if the URL is invalid.
32
+ """
33
+ if not is_azure_blob(message_data):
34
+ return None, None, None
35
+
36
+ log.debug("Detected Azure blob storage URL")
37
+ # Extract the account name, container name, and blob name from the URL
38
+ blob_url_pattern = r"https://(.*).blob.core.windows.net/(.*)/(.*)"
39
+ match = re.match(blob_url_pattern, message_data)
40
+ if not match:
41
+ log.error("Invalid Azure blob URL format")
42
+ return None, None
43
+
44
+ account_name, container_name, blob_name = match.groups()
45
+
46
+ return account_name, container_name, blob_name
47
+
@@ -0,0 +1,90 @@
1
+ # Copyright [2024] [Holosun ApS]
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import json
15
+ from datetime import datetime, timezone
16
+
17
+ from ..logging import log
18
+ from ..azure import process_azure_blob_event
19
+
20
+ from ..invoke import invoke_vac
21
+ from .process_chunker_data import process_chunker_data
22
+ from ..chunker.encode_metadata import create_metadata, encode_data
23
+ from ..agents.route import read_cloud_run_url
24
+
25
+ def data_to_embed_azure(events: list):
26
+ """Triggered from a message on an Azure Data Grid event.
27
+ Args:
28
+ data JSON
29
+ """
30
+ validation_event_type = "Microsoft.EventGrid.SubscriptionValidationEvent"
31
+ storage_blob_created_event = "Microsoft.Storage.BlobCreated"
32
+
33
+ all_chunks = []
34
+ vac_name = None
35
+ for event in events:
36
+ event_type = event['eventType']
37
+ data = event['data']
38
+
39
+ if event_type == validation_event_type:
40
+ validation_code = data['validationCode']
41
+ log.info(f"Got SubscriptionValidation event data, validation code: {validation_code}, topic: {event['topic']}")
42
+
43
+ # Return the validation response
44
+ return {"ValidationResponse": validation_code}
45
+ elif event_type == storage_blob_created_event:
46
+
47
+ message_data, metadata, vac_name = process_azure_blob_event(events)
48
+ metadata["return_chunks"] = True
49
+
50
+ #TODO: process the azure blob URL and download it
51
+
52
+ chunks = process_chunker_data(message_data, metadata, vac_name)
53
+ if chunks:
54
+ all_chunks.extend(chunks)
55
+
56
+ if not all_chunks or len(chunks) == 0:
57
+ return {'status': 'error', 'message': f'No chunks were found in events: {events}'}
58
+
59
+ if not vac_name:
60
+ return {'status': 'error', 'message': f'Could not find a valid VAC config name in payload {all_chunks}'}
61
+
62
+ metadata = create_metadata(vac_name, metadata)
63
+
64
+ embeds = []
65
+
66
+ for chunk in chunks:
67
+ log.info(f"Working on chunk {chunk['metadata']}")
68
+
69
+ # do this async?
70
+ content = chunk.get("page_content")
71
+ now_utc = datetime.now(timezone.utc)
72
+ formatted_time = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
73
+ chunk["metadata"]["eventTime"] = formatted_time
74
+ if not content:
75
+ log.error("No content chunk found, skipping")
76
+
77
+ continue
78
+
79
+ log.info(f"Sending chunk length {len(content)} to embedder")
80
+ processed_chunk = encode_data(vac = vac_name, content = json.dumps(chunk))
81
+
82
+ embed_url = read_cloud_run_url('embedder')
83
+
84
+ embed_res = invoke_vac(f"{embed_url}/embed_chunk", processed_chunk)
85
+ embeds.append(embed_res)
86
+
87
+ log.info("Embedding pipeline finished")
88
+
89
+ return embed_res
90
+
@@ -0,0 +1,64 @@
1
+ import uuid
2
+ import base64
3
+ import json
4
+ from datetime import datetime, timezone
5
+
6
+ from ..logging import log
7
+
8
+ def create_metadata(vac, metadata):
9
+ now_utc = datetime.now(timezone.utc)
10
+ formatted_time = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
11
+
12
+ # Default metadata if none provided
13
+ default_metadata = {"vector_name": vac, "source": "sunholo-cli", "eventTime": formatted_time}
14
+
15
+ try:
16
+ # Merge default metadata with provided metadata
17
+ if metadata:
18
+ if not isinstance(metadata, dict):
19
+ metadata = json.loads(metadata)
20
+ else:
21
+ metadata = {}
22
+ except Exception as err:
23
+ log.error(f"[bold red]ERROR: metadata not parsed: {err} for {metadata}")
24
+
25
+ # Update metadata with default values if not present
26
+ metadata.update(default_metadata)
27
+
28
+ return metadata
29
+
30
+ def encode_data(vac, content, metadata=None, local_chunks=False):
31
+
32
+ metadata = create_metadata(vac, metadata)
33
+
34
+ # Encode the content (URL)
35
+ if isinstance(content, str):
36
+ message_data = base64.b64encode(content.encode('utf-8')).decode('utf-8')
37
+ else:
38
+ raise ValueError(f"Unsupported content type: {type(content)}")
39
+
40
+ now_utc = datetime.now(timezone.utc)
41
+ formatted_time = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
42
+
43
+ # Construct the message dictionary
44
+ messageId = str(uuid.uuid4())
45
+ message = {
46
+ "message": {
47
+ "data": message_data,
48
+ "messageId": messageId,
49
+ "publishTime": formatted_time,
50
+ "attributes": {
51
+ "namespace": vac,
52
+ "return_chunks": str(local_chunks).lower()
53
+ },
54
+ }
55
+ }
56
+
57
+ # Merge metadata with attributes
58
+ message["message"]["attributes"].update(metadata)
59
+
60
+ #console.print()
61
+ #console.print(f"Sending message: {messageId} with metadata:")
62
+ #console.print(f"{message['message']['attributes']}")
63
+
64
+ return message
@@ -18,11 +18,17 @@ import tempfile
18
18
  import os
19
19
  import re
20
20
  import json
21
+
21
22
  try:
22
23
  from google.cloud import storage
23
24
  except ImportError:
24
25
  storage = None
25
26
 
27
+ try:
28
+ from azure.storage.blob import BlobServiceClient
29
+ except ImportError:
30
+ BlobServiceClient = None
31
+
26
32
  from langchain.schema import Document
27
33
 
28
34
 
@@ -33,8 +39,8 @@ from . import loaders
33
39
 
34
40
  from ..utils.parsers import extract_urls
35
41
  from ..gcs.add_file import add_file_to_gcs, get_pdf_split_file_name
36
-
37
-
42
+ from ..azure.blobs import extract_blob_parts
43
+ from ..azure.auth import azure_auth
38
44
 
39
45
  def handle_gcs_message(message_data: str, metadata: dict, vector_name: str):
40
46
 
@@ -199,4 +205,95 @@ def handle_json_content_message(message_data: dict, metadata: dict, vector_name:
199
205
 
200
206
  chunks = chunk_doc_to_docs(docs, vector_name=vector_name)
201
207
 
202
- return chunks, metadata
208
+ return chunks, metadata
209
+
210
+
211
+ def handle_azure_blob(message_data: str, metadata: dict, vector_name: str):
212
+ """
213
+ Processes a message from Azure Blob storage, downloads the file, processes it,
214
+ and returns chunks and metadata.
215
+
216
+ Args:
217
+ message_data (str): URL of the Azure blob.
218
+ metadata (dict): Metadata associated with the file.
219
+ vector_name (str): Vector name for processing.
220
+
221
+ Returns:
222
+ chunks (list): List of document chunks.
223
+ metadata (dict): Updated metadata.
224
+ """
225
+
226
+ if BlobServiceClient is None:
227
+ raise ImportError("BlobServiceClient is not installed - install via pip install sunholo[azure]")
228
+
229
+ account_name, container_name, blob_name = extract_blob_parts(message_data)
230
+
231
+ credential = azure_auth()
232
+ if credential is None:
233
+ log.error("BlobServiceClient could not find auth credentials")
234
+ return None, None
235
+
236
+ # Create a BlobServiceClient
237
+ blob_service_client = BlobServiceClient(
238
+ account_url=f"https://{account_name}.blob.core.windows.net",
239
+ credential=credential)
240
+
241
+ # Get the blob client
242
+ blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
243
+
244
+ file_name = pathlib.Path(blob_name)
245
+
246
+ with tempfile.TemporaryDirectory() as temp_dir:
247
+ tmp_file_path = os.path.join(temp_dir, file_name.name)
248
+ with open(tmp_file_path, "wb") as file:
249
+ download_stream = blob_client.download_blob()
250
+ file.write(download_stream.readall())
251
+
252
+ if file_name.suffix.lower() == ".pdf":
253
+ pages = split_pdf_to_pages(tmp_file_path, temp_dir)
254
+ if not metadata.get("source"):
255
+ metadata["source"] = str(file_name)
256
+ if len(pages) > 1:
257
+ log.info(f"Got back {len(pages)} pages for file {tmp_file_path}")
258
+ for pp in pages:
259
+ pp_basename = os.path.basename(pp)
260
+ # file_name/pdf_parts/file_name_1.pdf
261
+ azure_blob_path = f"{file_name.stem}_parts/{pp_basename}"
262
+
263
+ # Upload split pages back to Azure Blob storage
264
+ container_client = blob_service_client.get_container_client(container=container_name)
265
+ with open(file=pp, mode="rb") as page_file:
266
+ blob_client = container_client.upload_blob(name=azure_blob_path, data=page_file, overwrite=True)
267
+
268
+ log.info(f"{azure_blob_path} is now in container {container_name}")
269
+ log.info(f"Sent split pages for {file_name.name} back to Azure Blob to parallelize the imports")
270
+ return None, None
271
+ else:
272
+ # just original temp file
273
+ pages = [tmp_file_path]
274
+
275
+ the_metadata = {
276
+ "type": "file_load_azure_blob",
277
+ "container_name": container_name
278
+ }
279
+
280
+ if metadata.get("source") is None:
281
+ the_metadata["source"] = str(file_name)
282
+
283
+ metadata.update(the_metadata)
284
+
285
+ docs = []
286
+ for page in pages:
287
+ log.info(f"Sending file {page} to loaders.read_file_to_documents {metadata}")
288
+ docs2 = loaders.read_file_to_documents(page, metadata=metadata)
289
+ if docs2 is None:
290
+ log.warning(f"loaders.read_file_to_documents docs2 failed to load file {metadata}")
291
+ docs.extend(docs2)
292
+
293
+ if docs is None:
294
+ log.warning(f"loaders.read_file_to_documents docs failed to load file {metadata}")
295
+ return None, metadata
296
+ else:
297
+ chunks = chunk_doc_to_docs(docs, file_name.suffix, vector_name=vector_name)
298
+
299
+ return chunks, metadata
@@ -5,7 +5,8 @@ from .message_data import (
5
5
  handle_google_drive_message,
6
6
  handle_github_message,
7
7
  handle_http_message,
8
- handle_json_content_message
8
+ handle_json_content_message,
9
+ handle_azure_blob
9
10
  )
10
11
 
11
12
  from . import loaders
@@ -13,7 +14,7 @@ from ..llamaindex.import_files import llamaindex_chunker_check
13
14
  from ..discovery_engine.chunker_handler import discovery_engine_chunker_check
14
15
  from .publish import process_docs_chunks_vector_name
15
16
  from .splitter import chunk_doc_to_docs
16
-
17
+ from ..azure.blobs import is_azure_blob
17
18
 
18
19
  from ..logging import log
19
20
 
@@ -43,6 +44,9 @@ def process_chunker_data(message_data, metadata, vector_name):
43
44
  if message_data.startswith("gs://"):
44
45
  chunks, metadata = handle_gcs_message(message_data, metadata, vector_name)
45
46
 
47
+ elif is_azure_blob(message_data):
48
+ chunks, metadata = handle_azure_blob(message_data, metadata, vector_name)
49
+
46
50
  elif message_data.startswith("https://drive.google.com") or message_data.startswith("https://docs.google.com"):
47
51
  chunks, metadata = handle_google_drive_message(message_data, metadata, vector_name)
48
52
 
@@ -11,64 +11,7 @@ from rich.progress import Progress
11
11
  from ..invoke import invoke_vac
12
12
  from .chat_vac import resolve_service_url
13
13
  from .run_proxy import stop_proxy
14
-
15
- def create_metadata(vac, metadata):
16
- now_utc = datetime.now(timezone.utc)
17
- formatted_time = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
18
-
19
- # Default metadata if none provided
20
- default_metadata = {"vector_name": vac, "source": "sunholo-cli", "eventTime": formatted_time}
21
-
22
- try:
23
- # Merge default metadata with provided metadata
24
- if metadata:
25
- if not isinstance(metadata, dict):
26
- metadata = json.loads(metadata)
27
- else:
28
- metadata = {}
29
- except Exception as err:
30
- console.print(f"[bold red]ERROR: metadata not parsed: {err} for {metadata}")
31
-
32
- # Update metadata with default values if not present
33
- metadata.update(default_metadata)
34
-
35
- return metadata
36
-
37
- def encode_data(vac, content, metadata=None, local_chunks=False):
38
-
39
- metadata = create_metadata(vac, metadata)
40
-
41
- # Encode the content (URL)
42
- if isinstance(content, str):
43
- message_data = base64.b64encode(content.encode('utf-8')).decode('utf-8')
44
- else:
45
- raise ValueError(f"Unsupported content type: {type(content)}")
46
-
47
- now_utc = datetime.now(timezone.utc)
48
- formatted_time = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
49
-
50
- # Construct the message dictionary
51
- messageId = str(uuid.uuid4())
52
- message = {
53
- "message": {
54
- "data": message_data,
55
- "messageId": messageId,
56
- "publishTime": formatted_time,
57
- "attributes": {
58
- "namespace": vac,
59
- "return_chunks": str(local_chunks).lower()
60
- },
61
- }
62
- }
63
-
64
- # Merge metadata with attributes
65
- message["message"]["attributes"].update(metadata)
66
-
67
- #console.print()
68
- #console.print(f"Sending message: {messageId} with metadata:")
69
- #console.print(f"{message['message']['attributes']}")
70
-
71
- return message
14
+ from ..chunker.encode_metadata import create_metadata, encode_data
72
15
 
73
16
  def embed_command(args):
74
17
  chunk_args = vars(args).copy()
@@ -33,10 +33,10 @@ def invoke_vac(service_url, data, vector_name=None, metadata=None, is_file=False
33
33
  else:
34
34
  json_data = json.loads(data)
35
35
  except json.JSONDecodeError as err:
36
- log.error(f"[bold red]ERROR: invalid JSON: {str(err)} [/bold red]")
36
+ log.error(f"ERROR: invalid JSON: {str(err)}")
37
37
  raise err
38
38
  except Exception as err:
39
- log.error(f"[bold red]ERROR: could not parse JSON: {str(err)} [/bold red]")
39
+ log.error(f"ERROR: could not parse JSON: {str(err)}")
40
40
  raise err
41
41
 
42
42
  log.debug(f"Sending data: {data} or json_data: {json.dumps(json_data)}")
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sunholo
3
- Version: 0.77.4
3
+ Version: 0.78.2
4
4
  Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
5
5
  Home-page: https://github.com/sunholo-data/sunholo-py
6
- Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.77.4.tar.gz
6
+ Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.78.2.tar.gz
7
7
  Author: Holosun ApS
8
8
  Author-email: multivac@sunholo.com
9
9
  License: Apache License, Version 2.0
@@ -25,6 +25,7 @@ Requires-Dist: langchain_experimental>0.0.60
25
25
  Requires-Dist: langchain-community
26
26
  Provides-Extra: all
27
27
  Requires-Dist: asyncpg; extra == "all"
28
+ Requires-Dist: azure-identity; extra == "all"
28
29
  Requires-Dist: azure-storage-blob; extra == "all"
29
30
  Requires-Dist: fastapi; extra == "all"
30
31
  Requires-Dist: flask; extra == "all"
@@ -72,6 +73,7 @@ Requires-Dist: tantivy; extra == "all"
72
73
  Requires-Dist: tiktoken; extra == "all"
73
74
  Requires-Dist: unstructured[local-inference]==0.14.9; extra == "all"
74
75
  Provides-Extra: azure
76
+ Requires-Dist: azure-identity; extra == "azure"
75
77
  Requires-Dist: azure-storage-blob; extra == "azure"
76
78
  Provides-Extra: cli
77
79
  Requires-Dist: jsonschema>=4.21.1; extra == "cli"
@@ -33,6 +33,8 @@ sunholo/auth/gcloud.py
33
33
  sunholo/auth/refresh.py
34
34
  sunholo/auth/run.py
35
35
  sunholo/azure/__init__.py
36
+ sunholo/azure/auth.py
37
+ sunholo/azure/blobs.py
36
38
  sunholo/azure/event_grid.py
37
39
  sunholo/bots/__init__.py
38
40
  sunholo/bots/discord.py
@@ -41,6 +43,7 @@ sunholo/bots/webapp.py
41
43
  sunholo/chunker/__init__.py
42
44
  sunholo/chunker/azure.py
43
45
  sunholo/chunker/doc_handling.py
46
+ sunholo/chunker/encode_metadata.py
44
47
  sunholo/chunker/images.py
45
48
  sunholo/chunker/loaders.py
46
49
  sunholo/chunker/message_data.py
@@ -6,6 +6,7 @@ langchain-community
6
6
 
7
7
  [all]
8
8
  asyncpg
9
+ azure-identity
9
10
  azure-storage-blob
10
11
  fastapi
11
12
  flask
@@ -57,6 +58,7 @@ unstructured[local-inference]==0.14.9
57
58
  langchain-anthropic>=0.1.13
58
59
 
59
60
  [azure]
61
+ azure-identity
60
62
  azure-storage-blob
61
63
 
62
64
  [cli]
@@ -1,41 +0,0 @@
1
- # Copyright [2024] [Holosun ApS]
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from ..logging import log
16
- from ..azure import process_azure_blob_event
17
- from .process_chunker_data import process_chunker_data
18
-
19
- def data_to_embed_azure(events: list):
20
- """Triggered from a message on an Azure Data Grid event.
21
- Args:
22
- data JSON
23
- """
24
- validation_event_type = "Microsoft.EventGrid.SubscriptionValidationEvent"
25
- storage_blob_created_event = "Microsoft.Storage.BlobCreated"
26
-
27
- for event in events:
28
- event_type = event['eventType']
29
- data = event['data']
30
-
31
- if event_type == validation_event_type:
32
- validation_code = data['validationCode']
33
- log.info(f"Got SubscriptionValidation event data, validation code: {validation_code}, topic: {event['topic']}")
34
-
35
- # Return the validation response
36
- return {"ValidationResponse": validation_code}
37
- elif event_type == storage_blob_created_event:
38
-
39
- message_data, metadata, vector_name = process_azure_blob_event(events)
40
-
41
- return process_chunker_data(message_data, metadata, vector_name)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes