unique_toolkit 0.5.54__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/_common/validate_required_values.py +21 -0
- unique_toolkit/app/__init__.py +20 -0
- unique_toolkit/app/schemas.py +73 -7
- unique_toolkit/chat/__init__.py +5 -4
- unique_toolkit/chat/constants.py +3 -0
- unique_toolkit/chat/functions.py +661 -0
- unique_toolkit/chat/schemas.py +11 -11
- unique_toolkit/chat/service.py +273 -430
- unique_toolkit/content/__init__.py +1 -0
- unique_toolkit/content/constants.py +2 -0
- unique_toolkit/content/functions.py +475 -0
- unique_toolkit/content/service.py +163 -300
- unique_toolkit/content/utils.py +32 -0
- unique_toolkit/embedding/__init__.py +3 -0
- unique_toolkit/embedding/constants.py +2 -0
- unique_toolkit/embedding/functions.py +79 -0
- unique_toolkit/embedding/service.py +47 -34
- unique_toolkit/evaluators/__init__.py +1 -0
- unique_toolkit/evaluators/constants.py +1 -0
- unique_toolkit/evaluators/context_relevancy/constants.py +3 -3
- unique_toolkit/evaluators/context_relevancy/utils.py +5 -2
- unique_toolkit/evaluators/hallucination/utils.py +2 -1
- unique_toolkit/language_model/__init__.py +1 -0
- unique_toolkit/language_model/constants.py +4 -0
- unique_toolkit/language_model/functions.py +362 -0
- unique_toolkit/language_model/service.py +246 -293
- unique_toolkit/short_term_memory/__init__.py +5 -0
- unique_toolkit/short_term_memory/constants.py +1 -0
- unique_toolkit/short_term_memory/functions.py +175 -0
- unique_toolkit/short_term_memory/service.py +153 -27
- {unique_toolkit-0.5.54.dist-info → unique_toolkit-0.6.0.dist-info}/METADATA +36 -7
- unique_toolkit-0.6.0.dist-info/RECORD +64 -0
- unique_toolkit-0.5.54.dist-info/RECORD +0 -50
- {unique_toolkit-0.5.54.dist-info → unique_toolkit-0.6.0.dist-info}/LICENSE +0 -0
- {unique_toolkit-0.5.54.dist-info → unique_toolkit-0.6.0.dist-info}/WHEEL +0 -0
@@ -1,15 +1,23 @@
|
|
1
1
|
import logging
|
2
|
-
import os
|
3
|
-
import re
|
4
|
-
import tempfile
|
5
2
|
from pathlib import Path
|
6
|
-
from typing import Optional, Union, cast
|
7
3
|
|
8
|
-
import
|
9
|
-
import
|
10
|
-
|
11
|
-
from unique_toolkit._common.
|
12
|
-
from unique_toolkit.app.schemas import Event
|
4
|
+
from requests import Response
|
5
|
+
from typing_extensions import deprecated
|
6
|
+
|
7
|
+
from unique_toolkit._common.validate_required_values import validate_required_values
|
8
|
+
from unique_toolkit.app.schemas import BaseEvent, ChatEvent, Event
|
9
|
+
from unique_toolkit.content import DOMAIN_NAME
|
10
|
+
from unique_toolkit.content.constants import DEFAULT_SEARCH_LANGUAGE
|
11
|
+
from unique_toolkit.content.functions import (
|
12
|
+
download_content,
|
13
|
+
download_content_to_file_by_id,
|
14
|
+
request_content_by_id,
|
15
|
+
search_content_chunks,
|
16
|
+
search_content_chunks_async,
|
17
|
+
search_contents,
|
18
|
+
search_contents_async,
|
19
|
+
upload_content,
|
20
|
+
)
|
13
21
|
from unique_toolkit.content.schemas import (
|
14
22
|
Content,
|
15
23
|
ContentChunk,
|
@@ -17,21 +25,51 @@ from unique_toolkit.content.schemas import (
|
|
17
25
|
ContentSearchType,
|
18
26
|
)
|
19
27
|
|
28
|
+
logger = logging.getLogger(f"toolkit.{DOMAIN_NAME}.{__name__}")
|
20
29
|
|
21
|
-
|
30
|
+
|
31
|
+
class ContentService:
|
22
32
|
"""
|
23
33
|
Provides methods for searching, downloading and uploading content in the knowledge base.
|
24
34
|
|
25
35
|
Attributes:
|
26
|
-
|
27
|
-
|
36
|
+
company_id (str | None): The company ID.
|
37
|
+
user_id (str | None): The user ID.
|
38
|
+
metadata_filter (dict | None): The metadata filter.
|
39
|
+
chat_id (str | None): The chat ID.
|
28
40
|
"""
|
29
41
|
|
30
|
-
def __init__(
|
31
|
-
|
32
|
-
|
42
|
+
def __init__(
|
43
|
+
self,
|
44
|
+
event: Event | BaseEvent | None = None,
|
45
|
+
company_id: str | None = None,
|
46
|
+
user_id: str | None = None,
|
47
|
+
):
|
48
|
+
self._event = event # Changed to protected attribute
|
49
|
+
if event:
|
50
|
+
self.company_id = event.company_id
|
51
|
+
self.user_id = event.user_id
|
52
|
+
if isinstance(event, (ChatEvent, Event)):
|
53
|
+
self.metadata_filter = event.payload.metadata_filter
|
54
|
+
self.chat_id = event.payload.chat_id
|
55
|
+
else:
|
56
|
+
[company_id, user_id] = validate_required_values([company_id, user_id])
|
57
|
+
self.company_id = company_id
|
58
|
+
self.user_id = user_id
|
59
|
+
self.metadata_filter = None
|
60
|
+
|
61
|
+
@property
|
62
|
+
@deprecated(
|
63
|
+
"The event property is deprecated and will be removed in a future version."
|
64
|
+
)
|
65
|
+
def event(self) -> Event | BaseEvent | None:
|
66
|
+
"""
|
67
|
+
Get the event object (deprecated).
|
33
68
|
|
34
|
-
|
69
|
+
Returns:
|
70
|
+
Event | BaseEvent | None: The event object.
|
71
|
+
"""
|
72
|
+
return self._event
|
35
73
|
|
36
74
|
def search_content_chunks(
|
37
75
|
self,
|
@@ -39,11 +77,11 @@ class ContentService(BaseService):
|
|
39
77
|
search_type: ContentSearchType,
|
40
78
|
limit: int,
|
41
79
|
search_language: str = DEFAULT_SEARCH_LANGUAGE,
|
42
|
-
reranker_config:
|
43
|
-
scope_ids:
|
44
|
-
chat_only:
|
45
|
-
metadata_filter:
|
46
|
-
content_ids:
|
80
|
+
reranker_config: ContentRerankerConfig | None = None,
|
81
|
+
scope_ids: list[str] | None = None,
|
82
|
+
chat_only: bool | None = None,
|
83
|
+
metadata_filter: dict | None = None,
|
84
|
+
content_ids: list[str] | None = None,
|
47
85
|
) -> list[ContentChunk]:
|
48
86
|
"""
|
49
87
|
Performs a synchronous search for content chunks in the knowledge base.
|
@@ -53,61 +91,49 @@ class ContentService(BaseService):
|
|
53
91
|
search_type (ContentSearchType): The type of search to perform.
|
54
92
|
limit (int): The maximum number of results to return.
|
55
93
|
search_language (str): The language for the full-text search. Defaults to "english".
|
56
|
-
reranker_config (
|
57
|
-
scope_ids (
|
58
|
-
chat_only (
|
59
|
-
metadata_filter (
|
60
|
-
content_ids (
|
94
|
+
reranker_config (ContentRerankerConfig | None): The reranker configuration. Defaults to None.
|
95
|
+
scope_ids (list[str] | None): The scope IDs. Defaults to None.
|
96
|
+
chat_only (bool | None): Whether to search only in the current chat. Defaults to None.
|
97
|
+
metadata_filter (dict | None): UniqueQL metadata filter. If unspecified/None, it tries to use the metadata filter from the event. Defaults to None.
|
98
|
+
content_ids (list[str] | None): The content IDs to search. Defaults to None.
|
61
99
|
Returns:
|
62
100
|
list[ContentChunk]: The search results.
|
63
101
|
"""
|
64
|
-
if not scope_ids:
|
65
|
-
self.logger.warning("No scope IDs provided for search.")
|
66
102
|
|
67
103
|
if metadata_filter is None:
|
68
104
|
metadata_filter = self.metadata_filter
|
69
105
|
|
70
106
|
try:
|
71
|
-
searches =
|
72
|
-
user_id=self.
|
73
|
-
company_id=self.
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
scopeIds=scope_ids,
|
107
|
+
searches = search_content_chunks(
|
108
|
+
user_id=self.user_id,
|
109
|
+
company_id=self.company_id,
|
110
|
+
chat_id=self.chat_id,
|
111
|
+
search_string=search_string,
|
112
|
+
search_type=search_type,
|
78
113
|
limit=limit,
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
chatOnly=chat_only,
|
86
|
-
metaDataFilter=metadata_filter,
|
87
|
-
contentIds=content_ids,
|
114
|
+
search_language=search_language,
|
115
|
+
reranker_config=reranker_config,
|
116
|
+
scope_ids=scope_ids,
|
117
|
+
chat_only=chat_only,
|
118
|
+
metadata_filter=metadata_filter,
|
119
|
+
content_ids=content_ids,
|
88
120
|
)
|
121
|
+
return searches
|
89
122
|
except Exception as e:
|
90
|
-
|
123
|
+
logger.error(f"Error while searching content chunks: {e}")
|
91
124
|
raise e
|
92
125
|
|
93
|
-
def map_to_content_chunks(searches: list[unique_sdk.Search]):
|
94
|
-
return [ContentChunk(**search) for search in searches]
|
95
|
-
|
96
|
-
# TODO change return type in sdk from Search to list[Search]
|
97
|
-
searches = cast(list[unique_sdk.Search], searches)
|
98
|
-
return map_to_content_chunks(searches)
|
99
|
-
|
100
126
|
async def search_content_chunks_async(
|
101
127
|
self,
|
102
128
|
search_string: str,
|
103
129
|
search_type: ContentSearchType,
|
104
130
|
limit: int,
|
105
131
|
search_language: str = DEFAULT_SEARCH_LANGUAGE,
|
106
|
-
reranker_config:
|
107
|
-
scope_ids:
|
108
|
-
chat_only:
|
109
|
-
metadata_filter:
|
110
|
-
content_ids:
|
132
|
+
reranker_config: ContentRerankerConfig | None = None,
|
133
|
+
scope_ids: list[str] | None = None,
|
134
|
+
chat_only: bool | None = None,
|
135
|
+
metadata_filter: dict | None = None,
|
136
|
+
content_ids: list[str] | None = None,
|
111
137
|
):
|
112
138
|
"""
|
113
139
|
Performs an asynchronous search for content chunks in the knowledge base.
|
@@ -117,50 +143,37 @@ class ContentService(BaseService):
|
|
117
143
|
search_type (ContentSearchType): The type of search to perform.
|
118
144
|
limit (int): The maximum number of results to return.
|
119
145
|
search_language (str): The language for the full-text search. Defaults to "english".
|
120
|
-
reranker_config (
|
121
|
-
scope_ids (
|
122
|
-
chat_only (
|
123
|
-
metadata_filter (
|
124
|
-
content_ids (
|
146
|
+
reranker_config (ContentRerankerConfig | None): The reranker configuration. Defaults to None.
|
147
|
+
scope_ids (list[str] | None): The scope IDs. Defaults to None.
|
148
|
+
chat_only (bool | None): Whether to search only in the current chat. Defaults to None.
|
149
|
+
metadata_filter (dict | None): UniqueQL metadata filter. If unspecified/None, it tries to use the metadata filter from the event. Defaults to None.
|
150
|
+
content_ids (list[str] | None): The content IDs to search. Defaults to None.
|
125
151
|
Returns:
|
126
152
|
list[ContentChunk]: The search results.
|
127
153
|
"""
|
128
|
-
if not scope_ids:
|
129
|
-
self.logger.warning("No scope IDs provided for search.")
|
130
|
-
|
131
154
|
if metadata_filter is None:
|
132
155
|
metadata_filter = self.metadata_filter
|
133
156
|
|
134
157
|
try:
|
135
|
-
searches = await
|
136
|
-
user_id=self.
|
137
|
-
company_id=self.
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
scopeIds=scope_ids,
|
158
|
+
searches = await search_content_chunks_async(
|
159
|
+
user_id=self.user_id,
|
160
|
+
company_id=self.company_id,
|
161
|
+
chat_id=self.chat_id,
|
162
|
+
search_string=search_string,
|
163
|
+
search_type=search_type,
|
142
164
|
limit=limit,
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
chatOnly=chat_only,
|
150
|
-
metaDataFilter=metadata_filter,
|
151
|
-
contentIds=content_ids,
|
165
|
+
search_language=search_language,
|
166
|
+
reranker_config=reranker_config,
|
167
|
+
scope_ids=scope_ids,
|
168
|
+
chat_only=chat_only,
|
169
|
+
metadata_filter=metadata_filter,
|
170
|
+
content_ids=content_ids,
|
152
171
|
)
|
172
|
+
return searches
|
153
173
|
except Exception as e:
|
154
|
-
|
174
|
+
logger.error(f"Error while searching content chunks: {e}")
|
155
175
|
raise e
|
156
176
|
|
157
|
-
def map_to_content_chunks(searches: list[unique_sdk.Search]):
|
158
|
-
return [ContentChunk(**search) for search in searches]
|
159
|
-
|
160
|
-
# TODO change return type in sdk from Search to list[Search]
|
161
|
-
searches = cast(list[unique_sdk.Search], searches)
|
162
|
-
return map_to_content_chunks(searches)
|
163
|
-
|
164
177
|
def search_contents(
|
165
178
|
self,
|
166
179
|
where: dict,
|
@@ -175,19 +188,12 @@ class ContentService(BaseService):
|
|
175
188
|
Returns:
|
176
189
|
list[Content]: The search results.
|
177
190
|
"""
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
where=where, # type: ignore
|
185
|
-
)
|
186
|
-
except Exception as e:
|
187
|
-
self.logger.error(f"Error while searching contents: {e}")
|
188
|
-
raise e
|
189
|
-
|
190
|
-
return self._map_contents(contents)
|
191
|
+
return search_contents(
|
192
|
+
user_id=self.user_id,
|
193
|
+
company_id=self.company_id,
|
194
|
+
chat_id=self.chat_id,
|
195
|
+
where=where,
|
196
|
+
)
|
191
197
|
|
192
198
|
async def search_contents_async(
|
193
199
|
self,
|
@@ -202,59 +208,28 @@ class ContentService(BaseService):
|
|
202
208
|
Returns:
|
203
209
|
list[Content]: The search results.
|
204
210
|
"""
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
where=where, # type: ignore
|
212
|
-
)
|
213
|
-
except Exception as e:
|
214
|
-
self.logger.error(f"Error while searching contents: {e}")
|
215
|
-
raise e
|
216
|
-
|
217
|
-
return self._map_contents(contents)
|
211
|
+
return await search_contents_async(
|
212
|
+
user_id=self.user_id,
|
213
|
+
company_id=self.company_id,
|
214
|
+
chat_id=self.chat_id,
|
215
|
+
where=where,
|
216
|
+
)
|
218
217
|
|
219
218
|
def search_content_on_chat(
|
220
219
|
self,
|
221
220
|
) -> list[Content]:
|
222
|
-
where = {"ownerId": {"equals": self.
|
221
|
+
where = {"ownerId": {"equals": self.chat_id}}
|
223
222
|
|
224
223
|
return self.search_contents(where)
|
225
224
|
|
226
|
-
@staticmethod
|
227
|
-
def _map_content_chunk(content_chunk: dict):
|
228
|
-
return ContentChunk(
|
229
|
-
id=content_chunk["id"],
|
230
|
-
text=content_chunk["text"],
|
231
|
-
start_page=content_chunk["startPage"],
|
232
|
-
end_page=content_chunk["endPage"],
|
233
|
-
order=content_chunk["order"],
|
234
|
-
)
|
235
|
-
|
236
|
-
def _map_content(self, content: dict):
|
237
|
-
return Content(
|
238
|
-
id=content["id"],
|
239
|
-
key=content["key"],
|
240
|
-
title=content["title"],
|
241
|
-
url=content["url"],
|
242
|
-
chunks=[self._map_content_chunk(chunk) for chunk in content["chunks"]],
|
243
|
-
created_at=content["createdAt"],
|
244
|
-
updated_at=content["updatedAt"],
|
245
|
-
)
|
246
|
-
|
247
|
-
def _map_contents(self, contents):
|
248
|
-
return [self._map_content(content) for content in contents]
|
249
|
-
|
250
225
|
def upload_content(
|
251
226
|
self,
|
252
227
|
path_to_content: str,
|
253
228
|
content_name: str,
|
254
229
|
mime_type: str,
|
255
|
-
scope_id:
|
256
|
-
chat_id:
|
257
|
-
skip_ingestion:
|
230
|
+
scope_id: str | None = None,
|
231
|
+
chat_id: str | None = None,
|
232
|
+
skip_ingestion: bool = False,
|
258
233
|
):
|
259
234
|
"""
|
260
235
|
Uploads content to the knowledge base.
|
@@ -263,108 +238,30 @@ class ContentService(BaseService):
|
|
263
238
|
path_to_content (str): The path to the content to upload.
|
264
239
|
content_name (str): The name of the content.
|
265
240
|
mime_type (str): The MIME type of the content.
|
266
|
-
scope_id (
|
267
|
-
chat_id (
|
241
|
+
scope_id (str | None): The scope ID. Defaults to None.
|
242
|
+
chat_id (str | None): The chat ID. Defaults to None.
|
243
|
+
skip_ingestion (bool): Whether to skip ingestion. Defaults to False.
|
268
244
|
|
269
245
|
Returns:
|
270
246
|
Content: The uploaded content.
|
271
247
|
"""
|
272
248
|
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
self.logger.error(f"Error while uploading content: {e}")
|
284
|
-
raise e
|
285
|
-
|
286
|
-
def _trigger_upload_content(
|
287
|
-
self,
|
288
|
-
path_to_content: str,
|
289
|
-
content_name: str,
|
290
|
-
mime_type: str,
|
291
|
-
scope_id: Optional[str] = None,
|
292
|
-
chat_id: Optional[str] = None,
|
293
|
-
skip_ingestion: Optional[bool] = False,
|
294
|
-
):
|
295
|
-
if not chat_id and not scope_id:
|
296
|
-
raise ValueError("chat_id or scope_id must be provided")
|
297
|
-
|
298
|
-
byte_size = os.path.getsize(path_to_content)
|
299
|
-
created_content = unique_sdk.Content.upsert(
|
300
|
-
user_id=self.event.user_id,
|
301
|
-
company_id=self.event.company_id,
|
302
|
-
input={
|
303
|
-
"key": content_name,
|
304
|
-
"title": content_name,
|
305
|
-
"mimeType": mime_type,
|
306
|
-
},
|
307
|
-
scopeId=scope_id,
|
308
|
-
chatId=chat_id,
|
309
|
-
) # type: ignore
|
310
|
-
|
311
|
-
write_url = created_content["writeUrl"]
|
312
|
-
|
313
|
-
if not write_url:
|
314
|
-
error_msg = "Write url for uploaded content is missing"
|
315
|
-
self.logger.error(error_msg)
|
316
|
-
raise ValueError(error_msg)
|
317
|
-
|
318
|
-
# upload to azure blob storage SAS url uploadUrl the pdf file translatedFile make sure it is treated as a application/pdf
|
319
|
-
with open(path_to_content, "rb") as file:
|
320
|
-
requests.put(
|
321
|
-
url=write_url,
|
322
|
-
data=file,
|
323
|
-
headers={
|
324
|
-
"X-Ms-Blob-Content-Type": mime_type,
|
325
|
-
"X-Ms-Blob-Type": "BlockBlob",
|
326
|
-
},
|
327
|
-
)
|
328
|
-
|
329
|
-
read_url = created_content["readUrl"]
|
330
|
-
|
331
|
-
if not read_url:
|
332
|
-
error_msg = "Read url for uploaded content is missing"
|
333
|
-
self.logger.error(error_msg)
|
334
|
-
raise ValueError(error_msg)
|
335
|
-
|
336
|
-
input_dict = {
|
337
|
-
"key": content_name,
|
338
|
-
"title": content_name,
|
339
|
-
"mimeType": mime_type,
|
340
|
-
"byteSize": byte_size,
|
341
|
-
}
|
342
|
-
|
343
|
-
if skip_ingestion:
|
344
|
-
input_dict["ingestionConfig"] = {"uniqueIngestionMode": "SKIP_INGESTION"}
|
345
|
-
|
346
|
-
if chat_id:
|
347
|
-
unique_sdk.Content.upsert(
|
348
|
-
user_id=self.event.user_id,
|
349
|
-
company_id=self.event.company_id,
|
350
|
-
input=input_dict,
|
351
|
-
fileUrl=read_url,
|
352
|
-
chatId=chat_id,
|
353
|
-
) # type: ignore
|
354
|
-
else:
|
355
|
-
unique_sdk.Content.upsert(
|
356
|
-
user_id=self.event.user_id,
|
357
|
-
company_id=self.event.company_id,
|
358
|
-
input=input_dict,
|
359
|
-
fileUrl=read_url,
|
360
|
-
scopeId=scope_id,
|
361
|
-
) # type: ignore
|
362
|
-
|
363
|
-
return Content(**created_content)
|
249
|
+
return upload_content(
|
250
|
+
user_id=self.user_id,
|
251
|
+
company_id=self.company_id,
|
252
|
+
path_to_content=path_to_content,
|
253
|
+
content_name=content_name,
|
254
|
+
mime_type=mime_type,
|
255
|
+
scope_id=scope_id,
|
256
|
+
chat_id=chat_id,
|
257
|
+
skip_ingestion=skip_ingestion,
|
258
|
+
)
|
364
259
|
|
365
260
|
def request_content_by_id(
|
366
|
-
self,
|
367
|
-
|
261
|
+
self,
|
262
|
+
content_id: str,
|
263
|
+
chat_id: str | None,
|
264
|
+
) -> Response:
|
368
265
|
"""
|
369
266
|
Sends a request to download content from a chat.
|
370
267
|
|
@@ -376,36 +273,28 @@ class ContentService(BaseService):
|
|
376
273
|
requests.Response: The response object containing the downloaded content.
|
377
274
|
|
378
275
|
"""
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
"x-api-version": unique_sdk.api_version,
|
386
|
-
"x-app-id": unique_sdk.app_id,
|
387
|
-
"x-user-id": self.event.user_id,
|
388
|
-
"x-company-id": self.event.company_id,
|
389
|
-
"Authorization": "Bearer %s" % (unique_sdk.api_key,),
|
390
|
-
}
|
391
|
-
|
392
|
-
return requests.get(url, headers=headers)
|
276
|
+
return request_content_by_id(
|
277
|
+
user_id=self.user_id,
|
278
|
+
company_id=self.company_id,
|
279
|
+
content_id=content_id,
|
280
|
+
chat_id=chat_id,
|
281
|
+
)
|
393
282
|
|
394
283
|
def download_content_to_file_by_id(
|
395
284
|
self,
|
396
285
|
content_id: str,
|
397
|
-
chat_id:
|
286
|
+
chat_id: str | None = None,
|
398
287
|
filename: str | None = None,
|
399
|
-
tmp_dir_path:
|
288
|
+
tmp_dir_path: str | Path | None = "/tmp",
|
400
289
|
):
|
401
290
|
"""
|
402
291
|
Downloads content from a chat and saves it to a file.
|
403
292
|
|
404
293
|
Args:
|
405
294
|
content_id (str): The ID of the content to download.
|
406
|
-
chat_id (
|
295
|
+
chat_id (str | None): The ID of the chat to download from. Defaults to None and the file is downloaded from the knowledge base.
|
407
296
|
filename (str | None): The name of the file to save the content as. If not provided, the original filename will be used. Defaults to None.
|
408
|
-
tmp_dir_path (
|
297
|
+
tmp_dir_path (str | Path | None): The path to the temporary directory where the content will be saved. Defaults to "/tmp".
|
409
298
|
|
410
299
|
Returns:
|
411
300
|
Path: The path to the downloaded file.
|
@@ -414,42 +303,22 @@ class ContentService(BaseService):
|
|
414
303
|
Exception: If the download fails or the filename cannot be determined.
|
415
304
|
"""
|
416
305
|
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
match = re.search(
|
426
|
-
pattern, response.headers.get("Content-Disposition", "")
|
427
|
-
)
|
428
|
-
if match:
|
429
|
-
content_path = Path(random_dir) / match.group(1)
|
430
|
-
else:
|
431
|
-
error_msg = (
|
432
|
-
"Error downloading file: Filename could not be determined"
|
433
|
-
)
|
434
|
-
self.logger.error(error_msg)
|
435
|
-
raise Exception(error_msg)
|
436
|
-
|
437
|
-
with open(content_path, "wb") as file:
|
438
|
-
file.write(response.content)
|
439
|
-
else:
|
440
|
-
error_msg = f"Error downloading file: Status code {response.status_code}"
|
441
|
-
self.logger.error(error_msg)
|
442
|
-
raise Exception(error_msg)
|
443
|
-
|
444
|
-
return content_path
|
306
|
+
return download_content_to_file_by_id(
|
307
|
+
user_id=self.user_id,
|
308
|
+
company_id=self.company_id,
|
309
|
+
content_id=content_id,
|
310
|
+
chat_id=chat_id,
|
311
|
+
filename=filename,
|
312
|
+
tmp_dir_path=tmp_dir_path,
|
313
|
+
)
|
445
314
|
|
446
315
|
# TODO: Discuss if we should deprecate this method due to unclear use by content_name
|
447
316
|
def download_content(
|
448
317
|
self,
|
449
318
|
content_id: str,
|
450
319
|
content_name: str,
|
451
|
-
chat_id:
|
452
|
-
dir_path:
|
320
|
+
chat_id: str | None = None,
|
321
|
+
dir_path: str | Path | None = "/tmp",
|
453
322
|
) -> Path:
|
454
323
|
"""
|
455
324
|
Downloads content to temporary directory
|
@@ -467,17 +336,11 @@ class ContentService(BaseService):
|
|
467
336
|
Exception: If the download fails.
|
468
337
|
"""
|
469
338
|
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
else:
|
479
|
-
error_msg = f"Error downloading file: Status code {response.status_code}"
|
480
|
-
self.logger.error(error_msg)
|
481
|
-
raise Exception(error_msg)
|
482
|
-
|
483
|
-
return content_path
|
339
|
+
return download_content(
|
340
|
+
user_id=self.user_id,
|
341
|
+
company_id=self.company_id,
|
342
|
+
content_id=content_id,
|
343
|
+
content_name=content_name,
|
344
|
+
chat_id=chat_id,
|
345
|
+
dir_path=dir_path,
|
346
|
+
)
|
unique_toolkit/content/utils.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
import re
|
2
2
|
|
3
3
|
import tiktoken
|
4
|
+
import unique_sdk
|
4
5
|
|
5
6
|
from unique_toolkit.content.schemas import (
|
7
|
+
Content,
|
6
8
|
ContentChunk,
|
7
9
|
)
|
8
10
|
|
@@ -186,3 +188,33 @@ def count_tokens(text: str, encoding_model="cl100k_base") -> int:
|
|
186
188
|
"""
|
187
189
|
encoding = tiktoken.get_encoding(encoding_model)
|
188
190
|
return len(encoding.encode(text))
|
191
|
+
|
192
|
+
|
193
|
+
def map_content_chunk(content_chunk: dict):
|
194
|
+
return ContentChunk(
|
195
|
+
id=content_chunk["id"],
|
196
|
+
text=content_chunk["text"],
|
197
|
+
start_page=content_chunk["startPage"],
|
198
|
+
end_page=content_chunk["endPage"],
|
199
|
+
order=content_chunk["order"],
|
200
|
+
)
|
201
|
+
|
202
|
+
|
203
|
+
def map_content(content: dict):
|
204
|
+
return Content(
|
205
|
+
id=content["id"],
|
206
|
+
key=content["key"],
|
207
|
+
title=content["title"],
|
208
|
+
url=content["url"],
|
209
|
+
chunks=[map_content_chunk(chunk) for chunk in content["chunks"]],
|
210
|
+
created_at=content["createdAt"],
|
211
|
+
updated_at=content["updatedAt"],
|
212
|
+
)
|
213
|
+
|
214
|
+
|
215
|
+
def map_contents(contents):
|
216
|
+
return [map_content(content) for content in contents]
|
217
|
+
|
218
|
+
|
219
|
+
def map_to_content_chunks(searches: list[unique_sdk.Search]):
|
220
|
+
return [ContentChunk(**search) for search in searches]
|