unique_toolkit 0.5.55__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- unique_toolkit/_common/validate_required_values.py +21 -0
- unique_toolkit/app/__init__.py +20 -0
- unique_toolkit/app/schemas.py +73 -7
- unique_toolkit/chat/__init__.py +5 -4
- unique_toolkit/chat/constants.py +3 -0
- unique_toolkit/chat/functions.py +661 -0
- unique_toolkit/chat/schemas.py +11 -11
- unique_toolkit/chat/service.py +273 -430
- unique_toolkit/content/__init__.py +1 -0
- unique_toolkit/content/constants.py +2 -0
- unique_toolkit/content/functions.py +475 -0
- unique_toolkit/content/service.py +163 -315
- unique_toolkit/content/utils.py +32 -0
- unique_toolkit/embedding/__init__.py +3 -0
- unique_toolkit/embedding/constants.py +2 -0
- unique_toolkit/embedding/functions.py +79 -0
- unique_toolkit/embedding/service.py +47 -34
- unique_toolkit/evaluators/__init__.py +1 -0
- unique_toolkit/evaluators/constants.py +1 -0
- unique_toolkit/evaluators/context_relevancy/constants.py +3 -3
- unique_toolkit/evaluators/context_relevancy/utils.py +5 -2
- unique_toolkit/evaluators/hallucination/utils.py +2 -1
- unique_toolkit/language_model/__init__.py +1 -0
- unique_toolkit/language_model/constants.py +4 -0
- unique_toolkit/language_model/functions.py +362 -0
- unique_toolkit/language_model/service.py +246 -293
- unique_toolkit/short_term_memory/__init__.py +5 -0
- unique_toolkit/short_term_memory/constants.py +1 -0
- unique_toolkit/short_term_memory/functions.py +175 -0
- unique_toolkit/short_term_memory/service.py +153 -27
- {unique_toolkit-0.5.55.dist-info → unique_toolkit-0.6.0.dist-info}/METADATA +33 -7
- unique_toolkit-0.6.0.dist-info/RECORD +64 -0
- unique_toolkit-0.5.55.dist-info/RECORD +0 -50
- {unique_toolkit-0.5.55.dist-info → unique_toolkit-0.6.0.dist-info}/LICENSE +0 -0
- {unique_toolkit-0.5.55.dist-info → unique_toolkit-0.6.0.dist-info}/WHEEL +0 -0
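The headline change in 0.6.0 is the new per-domain `constants.py` and `functions.py` modules (chat, content, embedding, language_model, short_term_memory): stateless logic moves there and the service classes become thin wrappers, as the content diff below shows. A minimal sketch of the new import surface, limited to names that actually appear in the diff (not a complete list of exports):

    # New in 0.6.0: domain logic is importable without constructing a service.
    from unique_toolkit.content.constants import DEFAULT_SEARCH_LANGUAGE
    from unique_toolkit.content.functions import (
        download_content_to_file_by_id,
        search_content_chunks,
        search_contents,
        upload_content,
    )
    from unique_toolkit.content.schemas import ContentChunk, ContentSearchType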
unique_toolkit/content/service.py
CHANGED
@@ -1,15 +1,23 @@
 import logging
-import os
-import re
-import tempfile
 from pathlib import Path
-from typing import Optional, Union, cast

-import requests
-import unique_sdk
-
-from unique_toolkit._common.
-from unique_toolkit.app.schemas import Event
+from requests import Response
+from typing_extensions import deprecated
+
+from unique_toolkit._common.validate_required_values import validate_required_values
+from unique_toolkit.app.schemas import BaseEvent, ChatEvent, Event
+from unique_toolkit.content import DOMAIN_NAME
+from unique_toolkit.content.constants import DEFAULT_SEARCH_LANGUAGE
+from unique_toolkit.content.functions import (
+    download_content,
+    download_content_to_file_by_id,
+    request_content_by_id,
+    search_content_chunks,
+    search_content_chunks_async,
+    search_contents,
+    search_contents_async,
+    upload_content,
+)
 from unique_toolkit.content.schemas import (
     Content,
     ContentChunk,
@@ -17,21 +25,51 @@ from unique_toolkit.content.schemas import (
     ContentSearchType,
 )

+logger = logging.getLogger(f"toolkit.{DOMAIN_NAME}.{__name__}")

-class ContentService(BaseService):
+
+class ContentService:
     """
     Provides methods for searching, downloading and uploading content in the knowledge base.

     Attributes:
-
-
+        company_id (str | None): The company ID.
+        user_id (str | None): The user ID.
+        metadata_filter (dict | None): The metadata filter.
+        chat_id (str | None): The chat ID.
     """

-    def __init__(
-
-
+    def __init__(
+        self,
+        event: Event | BaseEvent | None = None,
+        company_id: str | None = None,
+        user_id: str | None = None,
+    ):
+        self._event = event  # Changed to protected attribute
+        if event:
+            self.company_id = event.company_id
+            self.user_id = event.user_id
+            if isinstance(event, (ChatEvent, Event)):
+                self.metadata_filter = event.payload.metadata_filter
+                self.chat_id = event.payload.chat_id
+        else:
+            [company_id, user_id] = validate_required_values([company_id, user_id])
+            self.company_id = company_id
+            self.user_id = user_id
+            self.metadata_filter = None
+
+    @property
+    @deprecated(
+        "The event property is deprecated and will be removed in a future version."
+    )
+    def event(self) -> Event | BaseEvent | None:
+        """
+        Get the event object (deprecated).

-
+        Returns:
+            Event | BaseEvent | None: The event object.
+        """
+        return self._event

     def search_content_chunks(
         self,
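The hunk above is the core behavioural change: ContentService no longer requires an Event and can be built from explicit IDs, while the event property is kept but deprecated. A minimal sketch of both construction paths, with placeholder IDs (the event object would normally come from the webhook payload):

    from unique_toolkit.content.service import ContentService

    # Event-driven construction still works and also fills chat_id and
    # metadata_filter from the payload:
    #   service = ContentService(event=event)

    # New in 0.6.0: explicit IDs (placeholders here), validated via
    # validate_required_values. Note that chat_id is not set on this path,
    # so chat-scoped helpers such as search_content_on_chat appear to still
    # require an event.
    service = ContentService(company_id="company_xxx", user_id="user_xxx")

    # The event property remains available but is marked deprecated.
    legacy_event = service.event  # None when constructed without an event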
@@ -39,11 +77,11 @@ class ContentService(BaseService):
         search_type: ContentSearchType,
         limit: int,
         search_language: str = DEFAULT_SEARCH_LANGUAGE,
-        reranker_config:
-        scope_ids:
-        chat_only:
-        metadata_filter:
-        content_ids:
+        reranker_config: ContentRerankerConfig | None = None,
+        scope_ids: list[str] | None = None,
+        chat_only: bool | None = None,
+        metadata_filter: dict | None = None,
+        content_ids: list[str] | None = None,
     ) -> list[ContentChunk]:
         """
         Performs a synchronous search for content chunks in the knowledge base.
@@ -53,64 +91,49 @@ class ContentService(BaseService):
             search_type (ContentSearchType): The type of search to perform.
             limit (int): The maximum number of results to return.
             search_language (str): The language for the full-text search. Defaults to "english".
-            reranker_config (
-            scope_ids (
-            chat_only (
-            metadata_filter (
-            content_ids (
+            reranker_config (ContentRerankerConfig | None): The reranker configuration. Defaults to None.
+            scope_ids (list[str] | None): The scope IDs. Defaults to None.
+            chat_only (bool | None): Whether to search only in the current chat. Defaults to None.
+            metadata_filter (dict | None): UniqueQL metadata filter. If unspecified/None, it tries to use the metadata filter from the event. Defaults to None.
+            content_ids (list[str] | None): The content IDs to search. Defaults to None.
         Returns:
             list[ContentChunk]: The search results.
         """
-        if not scope_ids:
-            self.logger.warning("No scope IDs provided for search.")
-
-        if content_ids:
-            self.logger.info("Searching chunks for content IDs: %s", content_ids)

         if metadata_filter is None:
             metadata_filter = self.metadata_filter

         try:
-            searches =
-                user_id=self.
-                company_id=self.
-
-
-
-                scopeIds=scope_ids,
+            searches = search_content_chunks(
+                user_id=self.user_id,
+                company_id=self.company_id,
+                chat_id=self.chat_id,
+                search_string=search_string,
+                search_type=search_type,
                 limit=limit,
-
-
-
-
-
-
-                chatOnly=chat_only,
-                metaDataFilter=metadata_filter,
-                contentIds=content_ids,
+                search_language=search_language,
+                reranker_config=reranker_config,
+                scope_ids=scope_ids,
+                chat_only=chat_only,
+                metadata_filter=metadata_filter,
+                content_ids=content_ids,
             )
+            return searches
         except Exception as e:
-
+            logger.error(f"Error while searching content chunks: {e}")
             raise e

-        def map_to_content_chunks(searches: list[unique_sdk.Search]):
-            return [ContentChunk(**search) for search in searches]
-
-        # TODO change return type in sdk from Search to list[Search]
-        searches = cast(list[unique_sdk.Search], searches)
-        return map_to_content_chunks(searches)
-
     async def search_content_chunks_async(
         self,
         search_string: str,
         search_type: ContentSearchType,
         limit: int,
         search_language: str = DEFAULT_SEARCH_LANGUAGE,
-        reranker_config:
-        scope_ids:
-        chat_only:
-        metadata_filter:
-        content_ids:
+        reranker_config: ContentRerankerConfig | None = None,
+        scope_ids: list[str] | None = None,
+        chat_only: bool | None = None,
+        metadata_filter: dict | None = None,
+        content_ids: list[str] | None = None,
     ):
         """
         Performs an asynchronous search for content chunks in the knowledge base.
@@ -120,53 +143,37 @@ class ContentService(BaseService):
             search_type (ContentSearchType): The type of search to perform.
             limit (int): The maximum number of results to return.
             search_language (str): The language for the full-text search. Defaults to "english".
-            reranker_config (
-            scope_ids (
-            chat_only (
-            metadata_filter (
-            content_ids (
+            reranker_config (ContentRerankerConfig | None): The reranker configuration. Defaults to None.
+            scope_ids (list[str] | None): The scope IDs. Defaults to None.
+            chat_only (bool | None): Whether to search only in the current chat. Defaults to None.
+            metadata_filter (dict | None): UniqueQL metadata filter. If unspecified/None, it tries to use the metadata filter from the event. Defaults to None.
+            content_ids (list[str] | None): The content IDs to search. Defaults to None.
         Returns:
             list[ContentChunk]: The search results.
         """
-        if not scope_ids:
-            self.logger.warning("No scope IDs provided for search.")
-
-        if content_ids:
-            self.logger.info("Searching chunks for content IDs: %s", content_ids)
-
         if metadata_filter is None:
             metadata_filter = self.metadata_filter

         try:
-            searches = await
-                user_id=self.
-                company_id=self.
-
-
-
-                scopeIds=scope_ids,
+            searches = await search_content_chunks_async(
+                user_id=self.user_id,
+                company_id=self.company_id,
+                chat_id=self.chat_id,
+                search_string=search_string,
+                search_type=search_type,
                 limit=limit,
-
-
-
-
-
-
-                chatOnly=chat_only,
-                metaDataFilter=metadata_filter,
-                contentIds=content_ids,
+                search_language=search_language,
+                reranker_config=reranker_config,
+                scope_ids=scope_ids,
+                chat_only=chat_only,
+                metadata_filter=metadata_filter,
+                content_ids=content_ids,
             )
+            return searches
         except Exception as e:
-
+            logger.error(f"Error while searching content chunks: {e}")
             raise e

-        def map_to_content_chunks(searches: list[unique_sdk.Search]):
-            return [ContentChunk(**search) for search in searches]
-
-        # TODO change return type in sdk from Search to list[Search]
-        searches = cast(list[unique_sdk.Search], searches)
-        return map_to_content_chunks(searches)
-
     def search_contents(
         self,
         where: dict,
@@ -181,22 +188,12 @@ class ContentService(BaseService):
         Returns:
             list[Content]: The search results.
         """
-
-        self.
-
-
-
-
-                company_id=self.event.company_id,
-                chatId=self.event.payload.chat_id,
-                # TODO add type parameter
-                where=where,  # type: ignore
-            )
-        except Exception as e:
-            self.logger.error(f"Error while searching contents: {e}")
-            raise e
-
-        return self._map_contents(contents)
+        return search_contents(
+            user_id=self.user_id,
+            company_id=self.company_id,
+            chat_id=self.chat_id,
+            where=where,
+        )

     async def search_contents_async(
         self,
@@ -211,62 +208,28 @@ class ContentService(BaseService):
         Returns:
             list[Content]: The search results.
         """
-
-        self.
-
-
-
-
-                company_id=self.event.company_id,
-                chatId=self.event.payload.chat_id,
-                # TODO add type parameter
-                where=where,  # type: ignore
-            )
-        except Exception as e:
-            self.logger.error(f"Error while searching contents: {e}")
-            raise e
-
-        return self._map_contents(contents)
+        return await search_contents_async(
+            user_id=self.user_id,
+            company_id=self.company_id,
+            chat_id=self.chat_id,
+            where=where,
+        )

     def search_content_on_chat(
         self,
     ) -> list[Content]:
-        where = {"ownerId": {"equals": self.
+        where = {"ownerId": {"equals": self.chat_id}}

         return self.search_contents(where)

-    @staticmethod
-    def _map_content_chunk(content_chunk: dict):
-        return ContentChunk(
-            id=content_chunk["id"],
-            text=content_chunk["text"],
-            start_page=content_chunk["startPage"],
-            end_page=content_chunk["endPage"],
-            order=content_chunk["order"],
-        )
-
-    def _map_content(self, content: dict):
-        return Content(
-            id=content["id"],
-            key=content["key"],
-            title=content["title"],
-            url=content["url"],
-            chunks=[self._map_content_chunk(chunk) for chunk in content["chunks"]],
-            created_at=content["createdAt"],
-            updated_at=content["updatedAt"],
-        )
-
-    def _map_contents(self, contents):
-        return [self._map_content(content) for content in contents]
-
     def upload_content(
         self,
         path_to_content: str,
         content_name: str,
         mime_type: str,
-        scope_id:
-        chat_id:
-        skip_ingestion:
+        scope_id: str | None = None,
+        chat_id: str | None = None,
+        skip_ingestion: bool = False,
     ):
         """
         Uploads content to the knowledge base.
@@ -275,108 +238,30 @@ class ContentService(BaseService):
             path_to_content (str): The path to the content to upload.
             content_name (str): The name of the content.
             mime_type (str): The MIME type of the content.
-            scope_id (
-            chat_id (
+            scope_id (str | None): The scope ID. Defaults to None.
+            chat_id (str | None): The chat ID. Defaults to None.
+            skip_ingestion (bool): Whether to skip ingestion. Defaults to False.

         Returns:
             Content: The uploaded content.
         """

-
-
-
-
-
-
-
-
-
-
-            self.logger.error(f"Error while uploading content: {e}")
-            raise e
-
-    def _trigger_upload_content(
-        self,
-        path_to_content: str,
-        content_name: str,
-        mime_type: str,
-        scope_id: Optional[str] = None,
-        chat_id: Optional[str] = None,
-        skip_ingestion: Optional[bool] = False,
-    ):
-        if not chat_id and not scope_id:
-            raise ValueError("chat_id or scope_id must be provided")
-
-        byte_size = os.path.getsize(path_to_content)
-        created_content = unique_sdk.Content.upsert(
-            user_id=self.event.user_id,
-            company_id=self.event.company_id,
-            input={
-                "key": content_name,
-                "title": content_name,
-                "mimeType": mime_type,
-            },
-            scopeId=scope_id,
-            chatId=chat_id,
-        )  # type: ignore
-
-        write_url = created_content["writeUrl"]
-
-        if not write_url:
-            error_msg = "Write url for uploaded content is missing"
-            self.logger.error(error_msg)
-            raise ValueError(error_msg)
-
-        # upload to azure blob storage SAS url uploadUrl the pdf file translatedFile make sure it is treated as a application/pdf
-        with open(path_to_content, "rb") as file:
-            requests.put(
-                url=write_url,
-                data=file,
-                headers={
-                    "X-Ms-Blob-Content-Type": mime_type,
-                    "X-Ms-Blob-Type": "BlockBlob",
-                },
-            )
-
-        read_url = created_content["readUrl"]
-
-        if not read_url:
-            error_msg = "Read url for uploaded content is missing"
-            self.logger.error(error_msg)
-            raise ValueError(error_msg)
-
-        input_dict = {
-            "key": content_name,
-            "title": content_name,
-            "mimeType": mime_type,
-            "byteSize": byte_size,
-        }
-
-        if skip_ingestion:
-            input_dict["ingestionConfig"] = {"uniqueIngestionMode": "SKIP_INGESTION"}
-
-        if chat_id:
-            unique_sdk.Content.upsert(
-                user_id=self.event.user_id,
-                company_id=self.event.company_id,
-                input=input_dict,
-                fileUrl=read_url,
-                chatId=chat_id,
-            )  # type: ignore
-        else:
-            unique_sdk.Content.upsert(
-                user_id=self.event.user_id,
-                company_id=self.event.company_id,
-                input=input_dict,
-                fileUrl=read_url,
-                scopeId=scope_id,
-            )  # type: ignore
-
-        return Content(**created_content)
+        return upload_content(
+            user_id=self.user_id,
+            company_id=self.company_id,
+            path_to_content=path_to_content,
+            content_name=content_name,
+            mime_type=mime_type,
+            scope_id=scope_id,
+            chat_id=chat_id,
+            skip_ingestion=skip_ingestion,
+        )

     def request_content_by_id(
-        self,
-
+        self,
+        content_id: str,
+        chat_id: str | None,
+    ) -> Response:
         """
         Sends a request to download content from a chat.

@@ -388,37 +273,28 @@ class ContentService(BaseService):
             requests.Response: The response object containing the downloaded content.

         """
-
-
-
-
-
-
-        headers = {
-            "x-api-version": unique_sdk.api_version,
-            "x-app-id": unique_sdk.app_id,
-            "x-user-id": self.event.user_id,
-            "x-company-id": self.event.company_id,
-            "Authorization": "Bearer %s" % (unique_sdk.api_key,),
-        }
-
-        return requests.get(url, headers=headers)
+        return request_content_by_id(
+            user_id=self.user_id,
+            company_id=self.company_id,
+            content_id=content_id,
+            chat_id=chat_id,
+        )

     def download_content_to_file_by_id(
         self,
         content_id: str,
-        chat_id:
+        chat_id: str | None = None,
         filename: str | None = None,
-        tmp_dir_path:
+        tmp_dir_path: str | Path | None = "/tmp",
     ):
         """
         Downloads content from a chat and saves it to a file.

         Args:
             content_id (str): The ID of the content to download.
-            chat_id (
+            chat_id (str | None): The ID of the chat to download from. Defaults to None and the file is downloaded from the knowledge base.
             filename (str | None): The name of the file to save the content as. If not provided, the original filename will be used. Defaults to None.
-            tmp_dir_path (
+            tmp_dir_path (str | Path | None): The path to the temporary directory where the content will be saved. Defaults to "/tmp".

         Returns:
             Path: The path to the downloaded file.
@@ -427,43 +303,22 @@ class ContentService(BaseService):
             Exception: If the download fails or the filename cannot be determined.
         """

-
-
-
-
-
-
-
-
-            pattern = r'filename="([^"]+)"'
-            match = re.search(
-                pattern, response.headers.get("Content-Disposition", "")
-            )
-            if match:
-                content_path = Path(random_dir) / match.group(1)
-            else:
-                error_msg = (
-                    "Error downloading file: Filename could not be determined"
-                )
-                self.logger.error(error_msg)
-                raise Exception(error_msg)
-
-            with open(content_path, "wb") as file:
-                file.write(response.content)
-        else:
-            error_msg = f"Error downloading file: Status code {response.status_code}"
-            self.logger.error(error_msg)
-            raise Exception(error_msg)
-
-        return content_path
+        return download_content_to_file_by_id(
+            user_id=self.user_id,
+            company_id=self.company_id,
+            content_id=content_id,
+            chat_id=chat_id,
+            filename=filename,
+            tmp_dir_path=tmp_dir_path,
+        )

     # TODO: Discuss if we should deprecate this method due to unclear use by content_name
     def download_content(
         self,
         content_id: str,
         content_name: str,
-        chat_id:
-        dir_path:
+        chat_id: str | None = None,
+        dir_path: str | Path | None = "/tmp",
     ) -> Path:
         """
         Downloads content to temporary directory
@@ -481,18 +336,11 @@ class ContentService(BaseService):
             Exception: If the download fails.
         """

-
-
-
-
-
-
-
-
-                file.write(response.content)
-        else:
-            error_msg = f"Error downloading file: Status code {response.status_code}"
-            self.logger.error(error_msg)
-            raise Exception(error_msg)
-
-        return content_path
+        return download_content(
+            user_id=self.user_id,
+            company_id=self.company_id,
+            content_id=content_id,
+            content_name=content_name,
+            chat_id=chat_id,
+            dir_path=dir_path,
+        )
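Every ContentService method above is now a thin wrapper around unique_toolkit.content.functions, so the same operations can be performed without a service instance. A minimal sketch under that assumption, using only keyword arguments that appear in the wrapper bodies above; the IDs are placeholders and the where filter copies the shape used by search_content_on_chat:

    from unique_toolkit.content.functions import (
        download_content_to_file_by_id,
        search_contents,
    )

    # Module-level style: pass the identifiers the old service took from the event.
    contents = search_contents(
        user_id="user_xxx",          # placeholder
        company_id="company_xxx",    # placeholder
        chat_id="chat_xxx",          # placeholder
        where={"ownerId": {"equals": "chat_xxx"}},
    )

    # Downloads work the same way; per the docstring above, chat_id=None
    # downloads from the knowledge base instead of a chat.
    path = download_content_to_file_by_id(
        user_id="user_xxx",
        company_id="company_xxx",
        content_id="cont_xxx",       # placeholder
        chat_id=None,
        filename=None,
        tmp_dir_path="/tmp",
    )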
unique_toolkit/content/utils.py
CHANGED
@@ -1,8 +1,10 @@
 import re

 import tiktoken
+import unique_sdk

 from unique_toolkit.content.schemas import (
+    Content,
     ContentChunk,
 )

@@ -186,3 +188,33 @@ def count_tokens(text: str, encoding_model="cl100k_base") -> int:
     """
     encoding = tiktoken.get_encoding(encoding_model)
     return len(encoding.encode(text))
+
+
+def map_content_chunk(content_chunk: dict):
+    return ContentChunk(
+        id=content_chunk["id"],
+        text=content_chunk["text"],
+        start_page=content_chunk["startPage"],
+        end_page=content_chunk["endPage"],
+        order=content_chunk["order"],
+    )
+
+
+def map_content(content: dict):
+    return Content(
+        id=content["id"],
+        key=content["key"],
+        title=content["title"],
+        url=content["url"],
+        chunks=[map_content_chunk(chunk) for chunk in content["chunks"]],
+        created_at=content["createdAt"],
+        updated_at=content["updatedAt"],
+    )
+
+
+def map_contents(contents):
+    return [map_content(content) for content in contents]
+
+
+def map_to_content_chunks(searches: list[unique_sdk.Search]):
+    return [ContentChunk(**search) for search in searches]