sunholo 0.119.3__py3-none-any.whl → 0.119.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunholo/discovery_engine/chunker_handler.py +6 -2
- sunholo/discovery_engine/discovery_engine_client.py +4 -0
- sunholo/embedder/embed_chunk.py +2 -19
- sunholo/embedder/embed_metadata.py +27 -0
- {sunholo-0.119.3.dist-info → sunholo-0.119.6.dist-info}/METADATA +1 -1
- {sunholo-0.119.3.dist-info → sunholo-0.119.6.dist-info}/RECORD +10 -9
- {sunholo-0.119.3.dist-info → sunholo-0.119.6.dist-info}/LICENSE.txt +0 -0
- {sunholo-0.119.3.dist-info → sunholo-0.119.6.dist-info}/WHEEL +0 -0
- {sunholo-0.119.3.dist-info → sunholo-0.119.6.dist-info}/entry_points.txt +0 -0
- {sunholo-0.119.3.dist-info → sunholo-0.119.6.dist-info}/top_level.txt +0 -0
|
@@ -5,7 +5,7 @@ from ..components import load_memories
|
|
|
5
5
|
|
|
6
6
|
from .discovery_engine_client import DiscoveryEngineClient
|
|
7
7
|
from .create_new import create_new_discovery_engine
|
|
8
|
-
|
|
8
|
+
from ..embedder.embed_metadata import audit_metadata
|
|
9
9
|
|
|
10
10
|
def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=None):
|
|
11
11
|
"""
|
|
@@ -65,11 +65,15 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
|
|
|
65
65
|
return None
|
|
66
66
|
for corp in corpuses:
|
|
67
67
|
try:
|
|
68
|
+
metadata = audit_metadata(metadata, chunk_length=500)
|
|
68
69
|
response = corp.import_document_with_metadata(
|
|
69
70
|
gcs_uri=message_data,
|
|
70
71
|
metadata=metadata
|
|
71
72
|
)
|
|
72
|
-
|
|
73
|
+
if response:
|
|
74
|
+
log.info(f"Imported file to corpus: {response} with metadata: {metadata}")
|
|
75
|
+
else:
|
|
76
|
+
log.warning(f"Could not import {message_data} got not response")
|
|
73
77
|
except Exception as err:
|
|
74
78
|
log.error(f"Error importing {message_data} - {corp=} - {str(err)}")
|
|
75
79
|
|
|
@@ -558,6 +558,9 @@ class DiscoveryEngineClient:
|
|
|
558
558
|
except json.JSONDecodeError as e:
|
|
559
559
|
log.error(f"Error decoding JSON in line: {line.strip()}. Error: {e}")
|
|
560
560
|
continue # Skip to the next line if there's an error
|
|
561
|
+
except Exception as e:
|
|
562
|
+
log.error(f"Unknown error: {str(e)}")
|
|
563
|
+
raise e
|
|
561
564
|
|
|
562
565
|
# 2. Use InlineSource to import:
|
|
563
566
|
request = discoveryengine.ImportDocumentsRequest(
|
|
@@ -615,6 +618,7 @@ class DiscoveryEngineClient:
|
|
|
615
618
|
|
|
616
619
|
except Exception as e:
|
|
617
620
|
log.error(f"Error importing document with metadata: {e}")
|
|
621
|
+
raise e
|
|
618
622
|
|
|
619
623
|
def get_mime_type(self, uri:str):
|
|
620
624
|
return guess_mime_type(uri)
|
sunholo/embedder/embed_chunk.py
CHANGED
|
@@ -26,6 +26,7 @@ from ..components import get_embeddings, pick_vectorstore, load_memories, pick_e
|
|
|
26
26
|
from ..custom_logging import log
|
|
27
27
|
from ..database.uuid import generate_uuid_from_object_id
|
|
28
28
|
from ..utils import ConfigManager
|
|
29
|
+
from .embed_metadata import audit_metadata
|
|
29
30
|
|
|
30
31
|
def embed_pubsub_chunk(data: dict):
|
|
31
32
|
"""Triggered from a message on a Cloud Pub/Sub topic "embed_chunk" topic
|
|
@@ -75,25 +76,7 @@ def embed_pubsub_chunk(data: dict):
|
|
|
75
76
|
|
|
76
77
|
log.info(f"Embedding: {vector_name} page_content: {page_content[:30]}...[{len(page_content)}] - {metadata}")
|
|
77
78
|
|
|
78
|
-
|
|
79
|
-
metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
|
|
80
|
-
metadata['eventtime'] = metadata['eventTime']
|
|
81
|
-
|
|
82
|
-
if 'source' not in metadata:
|
|
83
|
-
if 'objectId' in metadata:
|
|
84
|
-
metadata['source'] = metadata['objectId']
|
|
85
|
-
elif 'url' in metadata:
|
|
86
|
-
metadata['source'] = metadata['url']
|
|
87
|
-
else:
|
|
88
|
-
log.warning(f"No source found in metadata: {metadata}")
|
|
89
|
-
|
|
90
|
-
if 'original_source' not in metadata:
|
|
91
|
-
metadata['original_source'] = metadata.get('source')
|
|
92
|
-
else:
|
|
93
|
-
metadata['source'] = metadata['original_source']
|
|
94
|
-
|
|
95
|
-
if 'chunk_length' not in metadata:
|
|
96
|
-
metadata['chunk_length'] = len(page_content)
|
|
79
|
+
metadata = audit_metadata(metadata, chunk_length=len(page_content))
|
|
97
80
|
|
|
98
81
|
if 'doc_id' not in metadata:
|
|
99
82
|
log.warning(f"No doc_id found in metadata for {metadata['source']}- creating one")
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
|
|
2
|
+
import datetime
|
|
3
|
+
from ..custom_logging import log
|
|
4
|
+
|
|
5
|
+
def audit_metadata(metadata, chunk_length=None):
|
|
6
|
+
|
|
7
|
+
if 'eventTime' not in metadata:
|
|
8
|
+
metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
|
|
9
|
+
metadata['eventtime'] = metadata['eventTime']
|
|
10
|
+
|
|
11
|
+
if 'source' not in metadata:
|
|
12
|
+
if 'objectId' in metadata:
|
|
13
|
+
metadata['source'] = metadata['objectId']
|
|
14
|
+
elif 'url' in metadata:
|
|
15
|
+
metadata['source'] = metadata['url']
|
|
16
|
+
else:
|
|
17
|
+
log.warning(f"No source found in metadata: {metadata}")
|
|
18
|
+
|
|
19
|
+
if 'original_source' not in metadata:
|
|
20
|
+
metadata['original_source'] = metadata.get('source')
|
|
21
|
+
else:
|
|
22
|
+
metadata['source'] = metadata['original_source']
|
|
23
|
+
|
|
24
|
+
if 'chunk_length' not in metadata:
|
|
25
|
+
metadata['chunk_length'] = chunk_length
|
|
26
|
+
|
|
27
|
+
return metadata
|
|
@@ -72,12 +72,13 @@ sunholo/database/sql/sb/delete_source_row.sql,sha256=r6fEuUKdbiLHCDGKSbKINDCpJjs
|
|
|
72
72
|
sunholo/database/sql/sb/return_sources.sql,sha256=89KAnxfK8n_qGK9jy1OQT8f9n4uYUtYL5cCxbC2mj_c,255
|
|
73
73
|
sunholo/database/sql/sb/setup.sql,sha256=CvoFvZQev2uWjmFa3aj3m3iuPFzAAJZ0S7Qi3L3-zZI,89
|
|
74
74
|
sunholo/discovery_engine/__init__.py,sha256=hLgqRDJ22Aov9o2QjAEfsVgnL3kMdM-g5p8RJ9OyKdQ,130
|
|
75
|
-
sunholo/discovery_engine/chunker_handler.py,sha256=
|
|
75
|
+
sunholo/discovery_engine/chunker_handler.py,sha256=1kwVhy9hAxwXjOLpeF_Zc1uFK3uJIwB54gvTnwyjcv0,5849
|
|
76
76
|
sunholo/discovery_engine/create_new.py,sha256=WUi4_xh_dFaGX3xA9jkNKZhaR6LCELjMPeRb0hyj4FU,1226
|
|
77
|
-
sunholo/discovery_engine/discovery_engine_client.py,sha256=
|
|
77
|
+
sunholo/discovery_engine/discovery_engine_client.py,sha256=b6UT7s-1zqrLnIfoP9bDs64hk0SUsqaA40ghbs2A7go,27552
|
|
78
78
|
sunholo/discovery_engine/get_ai_search_chunks.py,sha256=hsFGOQugSeTMPEaQ16XTs_D45F8NABBm2IsAEdTk7kQ,4316
|
|
79
79
|
sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
|
|
80
|
-
sunholo/embedder/embed_chunk.py,sha256=
|
|
80
|
+
sunholo/embedder/embed_chunk.py,sha256=did2pKkWM2o0KkRcb0H9l2x_WjCq6OyuHDxGbITFKPM,6530
|
|
81
|
+
sunholo/embedder/embed_metadata.py,sha256=2ziUIdVwnbCUU8gOwQWEvkrRcyp-7IeyZfSsWNkMquA,866
|
|
81
82
|
sunholo/excel/__init__.py,sha256=AqTMN9K4qJYi4maEgoORc5oxDVGO_eqmwzDaVP37JgY,56
|
|
82
83
|
sunholo/excel/plugin.py,sha256=rl3FoECZ6Ts8KKExPrbPwr3u3CegZfsevmcjgUXAlhE,4033
|
|
83
84
|
sunholo/gcs/__init__.py,sha256=SZvbsMFDko40sIRHTHppA37IijvJTae54vrhooEF5-4,90
|
|
@@ -164,9 +165,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
|
|
|
164
165
|
sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
|
|
165
166
|
sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
|
|
166
167
|
sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
|
|
167
|
-
sunholo-0.119.
|
|
168
|
-
sunholo-0.119.
|
|
169
|
-
sunholo-0.119.
|
|
170
|
-
sunholo-0.119.
|
|
171
|
-
sunholo-0.119.
|
|
172
|
-
sunholo-0.119.
|
|
168
|
+
sunholo-0.119.6.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
|
|
169
|
+
sunholo-0.119.6.dist-info/METADATA,sha256=z8wn78_BYKX8y6CyyxcMbKPA4hkJNPO3kwgVivrJSKQ,9654
|
|
170
|
+
sunholo-0.119.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
171
|
+
sunholo-0.119.6.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
|
|
172
|
+
sunholo-0.119.6.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
|
|
173
|
+
sunholo-0.119.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|