sunholo 0.119.4__py3-none-any.whl → 0.119.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunholo/discovery_engine/chunker_handler.py +2 -1
- sunholo/embedder/embed_chunk.py +2 -19
- sunholo/embedder/embed_metadata.py +27 -0
- {sunholo-0.119.4.dist-info → sunholo-0.119.6.dist-info}/METADATA +1 -1
- {sunholo-0.119.4.dist-info → sunholo-0.119.6.dist-info}/RECORD +9 -8
- {sunholo-0.119.4.dist-info → sunholo-0.119.6.dist-info}/LICENSE.txt +0 -0
- {sunholo-0.119.4.dist-info → sunholo-0.119.6.dist-info}/WHEEL +0 -0
- {sunholo-0.119.4.dist-info → sunholo-0.119.6.dist-info}/entry_points.txt +0 -0
- {sunholo-0.119.4.dist-info → sunholo-0.119.6.dist-info}/top_level.txt +0 -0
|
@@ -5,7 +5,7 @@ from ..components import load_memories
|
|
|
5
5
|
|
|
6
6
|
from .discovery_engine_client import DiscoveryEngineClient
|
|
7
7
|
from .create_new import create_new_discovery_engine
|
|
8
|
-
|
|
8
|
+
from ..embedder.embed_metadata import audit_metadata
|
|
9
9
|
|
|
10
10
|
def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=None):
|
|
11
11
|
"""
|
|
@@ -65,6 +65,7 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
|
|
|
65
65
|
return None
|
|
66
66
|
for corp in corpuses:
|
|
67
67
|
try:
|
|
68
|
+
metadata = audit_metadata(metadata, chunk_length=500)
|
|
68
69
|
response = corp.import_document_with_metadata(
|
|
69
70
|
gcs_uri=message_data,
|
|
70
71
|
metadata=metadata
|
sunholo/embedder/embed_chunk.py
CHANGED
|
@@ -26,6 +26,7 @@ from ..components import get_embeddings, pick_vectorstore, load_memories, pick_e
|
|
|
26
26
|
from ..custom_logging import log
|
|
27
27
|
from ..database.uuid import generate_uuid_from_object_id
|
|
28
28
|
from ..utils import ConfigManager
|
|
29
|
+
from .embed_metadata import audit_metadata
|
|
29
30
|
|
|
30
31
|
def embed_pubsub_chunk(data: dict):
|
|
31
32
|
"""Triggered from a message on a Cloud Pub/Sub topic "embed_chunk" topic
|
|
@@ -75,25 +76,7 @@ def embed_pubsub_chunk(data: dict):
|
|
|
75
76
|
|
|
76
77
|
log.info(f"Embedding: {vector_name} page_content: {page_content[:30]}...[{len(page_content)}] - {metadata}")
|
|
77
78
|
|
|
78
|
-
|
|
79
|
-
metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
|
|
80
|
-
metadata['eventtime'] = metadata['eventTime']
|
|
81
|
-
|
|
82
|
-
if 'source' not in metadata:
|
|
83
|
-
if 'objectId' in metadata:
|
|
84
|
-
metadata['source'] = metadata['objectId']
|
|
85
|
-
elif 'url' in metadata:
|
|
86
|
-
metadata['source'] = metadata['url']
|
|
87
|
-
else:
|
|
88
|
-
log.warning(f"No source found in metadata: {metadata}")
|
|
89
|
-
|
|
90
|
-
if 'original_source' not in metadata:
|
|
91
|
-
metadata['original_source'] = metadata.get('source')
|
|
92
|
-
else:
|
|
93
|
-
metadata['source'] = metadata['original_source']
|
|
94
|
-
|
|
95
|
-
if 'chunk_length' not in metadata:
|
|
96
|
-
metadata['chunk_length'] = len(page_content)
|
|
79
|
+
metadata = audit_metadata(metadata, chunk_length=len(page_content))
|
|
97
80
|
|
|
98
81
|
if 'doc_id' not in metadata:
|
|
99
82
|
log.warning(f"No doc_id found in metadata for {metadata['source']}- creating one")
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
|
|
2
|
+
import datetime
|
|
3
|
+
from ..custom_logging import log
|
|
4
|
+
|
|
5
|
+
def audit_metadata(metadata, chunk_length=None):
|
|
6
|
+
|
|
7
|
+
if 'eventTime' not in metadata:
|
|
8
|
+
metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
|
|
9
|
+
metadata['eventtime'] = metadata['eventTime']
|
|
10
|
+
|
|
11
|
+
if 'source' not in metadata:
|
|
12
|
+
if 'objectId' in metadata:
|
|
13
|
+
metadata['source'] = metadata['objectId']
|
|
14
|
+
elif 'url' in metadata:
|
|
15
|
+
metadata['source'] = metadata['url']
|
|
16
|
+
else:
|
|
17
|
+
log.warning(f"No source found in metadata: {metadata}")
|
|
18
|
+
|
|
19
|
+
if 'original_source' not in metadata:
|
|
20
|
+
metadata['original_source'] = metadata.get('source')
|
|
21
|
+
else:
|
|
22
|
+
metadata['source'] = metadata['original_source']
|
|
23
|
+
|
|
24
|
+
if 'chunk_length' not in metadata:
|
|
25
|
+
metadata['chunk_length'] = chunk_length
|
|
26
|
+
|
|
27
|
+
return metadata
|
|
@@ -72,12 +72,13 @@ sunholo/database/sql/sb/delete_source_row.sql,sha256=r6fEuUKdbiLHCDGKSbKINDCpJjs
|
|
|
72
72
|
sunholo/database/sql/sb/return_sources.sql,sha256=89KAnxfK8n_qGK9jy1OQT8f9n4uYUtYL5cCxbC2mj_c,255
|
|
73
73
|
sunholo/database/sql/sb/setup.sql,sha256=CvoFvZQev2uWjmFa3aj3m3iuPFzAAJZ0S7Qi3L3-zZI,89
|
|
74
74
|
sunholo/discovery_engine/__init__.py,sha256=hLgqRDJ22Aov9o2QjAEfsVgnL3kMdM-g5p8RJ9OyKdQ,130
|
|
75
|
-
sunholo/discovery_engine/chunker_handler.py,sha256=
|
|
75
|
+
sunholo/discovery_engine/chunker_handler.py,sha256=1kwVhy9hAxwXjOLpeF_Zc1uFK3uJIwB54gvTnwyjcv0,5849
|
|
76
76
|
sunholo/discovery_engine/create_new.py,sha256=WUi4_xh_dFaGX3xA9jkNKZhaR6LCELjMPeRb0hyj4FU,1226
|
|
77
77
|
sunholo/discovery_engine/discovery_engine_client.py,sha256=b6UT7s-1zqrLnIfoP9bDs64hk0SUsqaA40ghbs2A7go,27552
|
|
78
78
|
sunholo/discovery_engine/get_ai_search_chunks.py,sha256=hsFGOQugSeTMPEaQ16XTs_D45F8NABBm2IsAEdTk7kQ,4316
|
|
79
79
|
sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
|
|
80
|
-
sunholo/embedder/embed_chunk.py,sha256=
|
|
80
|
+
sunholo/embedder/embed_chunk.py,sha256=did2pKkWM2o0KkRcb0H9l2x_WjCq6OyuHDxGbITFKPM,6530
|
|
81
|
+
sunholo/embedder/embed_metadata.py,sha256=2ziUIdVwnbCUU8gOwQWEvkrRcyp-7IeyZfSsWNkMquA,866
|
|
81
82
|
sunholo/excel/__init__.py,sha256=AqTMN9K4qJYi4maEgoORc5oxDVGO_eqmwzDaVP37JgY,56
|
|
82
83
|
sunholo/excel/plugin.py,sha256=rl3FoECZ6Ts8KKExPrbPwr3u3CegZfsevmcjgUXAlhE,4033
|
|
83
84
|
sunholo/gcs/__init__.py,sha256=SZvbsMFDko40sIRHTHppA37IijvJTae54vrhooEF5-4,90
|
|
@@ -164,9 +165,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
|
|
|
164
165
|
sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
|
|
165
166
|
sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
|
|
166
167
|
sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
|
|
167
|
-
sunholo-0.119.
|
|
168
|
-
sunholo-0.119.
|
|
169
|
-
sunholo-0.119.
|
|
170
|
-
sunholo-0.119.
|
|
171
|
-
sunholo-0.119.
|
|
172
|
-
sunholo-0.119.
|
|
168
|
+
sunholo-0.119.6.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
|
|
169
|
+
sunholo-0.119.6.dist-info/METADATA,sha256=z8wn78_BYKX8y6CyyxcMbKPA4hkJNPO3kwgVivrJSKQ,9654
|
|
170
|
+
sunholo-0.119.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
171
|
+
sunholo-0.119.6.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
|
|
172
|
+
sunholo-0.119.6.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
|
|
173
|
+
sunholo-0.119.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|