groundx 2.2.9__tar.gz → 2.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {groundx-2.2.9 → groundx-2.3.3}/PKG-INFO +1 -1
- {groundx-2.2.9 → groundx-2.3.3}/pyproject.toml +1 -1
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/__init__.py +14 -16
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/client_wrapper.py +1 -1
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/documents/client.py +26 -13
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/ingest.py +170 -60
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/search/client.py +22 -2
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/__init__.py +14 -16
- groundx-2.3.3/src/groundx/types/bucket_list_response.py +34 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/document_detail.py +5 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/document_lookup_response.py +5 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/group_list_response.py +15 -1
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/ingest_local_document_metadata.py +5 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/ingest_remote_document.py +6 -1
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/ingest_response.py +2 -2
- groundx-2.2.9/src/groundx/types/process_status_response_ingest.py → groundx-2.3.3/src/groundx/types/ingest_status.py +5 -4
- groundx-2.2.9/src/groundx/types/ingest_response_ingest.py → groundx-2.3.3/src/groundx/types/ingest_status_light.py +4 -2
- groundx-2.3.3/src/groundx/types/ingest_status_progress.py +26 -0
- groundx-2.2.9/src/groundx/types/process_status_response_ingest_progress_errors.py → groundx-2.3.3/src/groundx/types/ingest_status_progress_cancelled.py +1 -1
- groundx-2.2.9/src/groundx/types/process_status_response_ingest_progress_complete.py → groundx-2.3.3/src/groundx/types/ingest_status_progress_complete.py +1 -1
- groundx-2.2.9/src/groundx/types/process_status_response_ingest_progress_cancelled.py → groundx-2.3.3/src/groundx/types/ingest_status_progress_errors.py +1 -1
- groundx-2.2.9/src/groundx/types/process_status_response_ingest_progress_processing.py → groundx-2.3.3/src/groundx/types/ingest_status_progress_processing.py +1 -1
- groundx-2.2.9/src/groundx/types/bucket_list_response.py → groundx-2.3.3/src/groundx/types/processes_status_response.py +7 -4
- groundx-2.2.9/src/groundx/types/process_status_response.py +0 -20
- groundx-2.2.9/src/groundx/types/process_status_response_ingest_progress.py +0 -26
- groundx-2.2.9/src/groundx/types/processes_status_response.py +0 -6
- {groundx-2.2.9 → groundx-2.3.3}/LICENSE +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/README.md +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/buckets/__init__.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/buckets/client.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/client.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/__init__.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/api_error.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/datetime_utils.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/file.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/http_client.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/jsonable_encoder.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/pydantic_utilities.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/query_encoder.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/remove_none_from_dict.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/request_options.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/core/serialization.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/csv_splitter.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/customer/__init__.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/customer/client.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/documents/__init__.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/environment.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/errors/__init__.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/errors/bad_request_error.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/errors/unauthorized_error.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/groups/__init__.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/groups/client.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/health/__init__.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/health/client.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/py.typed +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/search/__init__.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/search/types/__init__.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/search/types/search_content_request_id.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/bounding_box_detail.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/bucket_detail.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/bucket_response.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/bucket_update_detail.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/bucket_update_response.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/customer_detail.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/customer_response.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/document.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/document_list_response.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/document_local_ingest_request.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/document_response.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/document_type.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/group_detail.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/group_response.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/health_response.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/health_response_health.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/health_service.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/health_service_status.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/ingest_local_document.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/message_response.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/meter_detail.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/process_level.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/processing_status.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/search_response.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/search_response_search.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/search_result_item.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/sort.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/sort_order.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/subscription_detail.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/subscription_detail_meters.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/types/website_source.py +0 -0
- {groundx-2.2.9 → groundx-2.3.3}/src/groundx/version.py +0 -0
@@ -27,17 +27,16 @@ from .types import (
|
|
27
27
|
IngestLocalDocumentMetadata,
|
28
28
|
IngestRemoteDocument,
|
29
29
|
IngestResponse,
|
30
|
-
|
30
|
+
IngestStatus,
|
31
|
+
IngestStatusLight,
|
32
|
+
IngestStatusProgress,
|
33
|
+
IngestStatusProgressCancelled,
|
34
|
+
IngestStatusProgressComplete,
|
35
|
+
IngestStatusProgressErrors,
|
36
|
+
IngestStatusProgressProcessing,
|
31
37
|
MessageResponse,
|
32
38
|
MeterDetail,
|
33
39
|
ProcessLevel,
|
34
|
-
ProcessStatusResponse,
|
35
|
-
ProcessStatusResponseIngest,
|
36
|
-
ProcessStatusResponseIngestProgress,
|
37
|
-
ProcessStatusResponseIngestProgressCancelled,
|
38
|
-
ProcessStatusResponseIngestProgressComplete,
|
39
|
-
ProcessStatusResponseIngestProgressErrors,
|
40
|
-
ProcessStatusResponseIngestProgressProcessing,
|
41
40
|
ProcessesStatusResponse,
|
42
41
|
ProcessingStatus,
|
43
42
|
SearchResponse,
|
@@ -87,17 +86,16 @@ __all__ = [
|
|
87
86
|
"IngestLocalDocumentMetadata",
|
88
87
|
"IngestRemoteDocument",
|
89
88
|
"IngestResponse",
|
90
|
-
"IngestResponseIngest",
|
89
|
+
"IngestStatus",
|
90
|
+
"IngestStatusLight",
|
91
|
+
"IngestStatusProgress",
|
92
|
+
"IngestStatusProgressCancelled",
|
93
|
+
"IngestStatusProgressComplete",
|
94
|
+
"IngestStatusProgressErrors",
|
95
|
+
"IngestStatusProgressProcessing",
|
91
96
|
"MessageResponse",
|
92
97
|
"MeterDetail",
|
93
98
|
"ProcessLevel",
|
94
|
-
"ProcessStatusResponse",
|
95
|
-
"ProcessStatusResponseIngest",
|
96
|
-
"ProcessStatusResponseIngestProgress",
|
97
|
-
"ProcessStatusResponseIngestProgressCancelled",
|
98
|
-
"ProcessStatusResponseIngestProgressComplete",
|
99
|
-
"ProcessStatusResponseIngestProgressErrors",
|
100
|
-
"ProcessStatusResponseIngestProgressProcessing",
|
101
99
|
"ProcessesStatusResponse",
|
102
100
|
"ProcessingStatus",
|
103
101
|
"SearchContentRequestId",
|
@@ -17,7 +17,6 @@ from ..types.sort import Sort
|
|
17
17
|
from ..types.sort_order import SortOrder
|
18
18
|
from ..types.processing_status import ProcessingStatus
|
19
19
|
from ..types.document_list_response import DocumentListResponse
|
20
|
-
from ..types.process_status_response import ProcessStatusResponse
|
21
20
|
from ..core.jsonable_encoder import jsonable_encoder
|
22
21
|
from ..types.document_lookup_response import DocumentLookupResponse
|
23
22
|
from ..types.document_response import DocumentResponse
|
@@ -41,6 +40,8 @@ class DocumentsClient:
|
|
41
40
|
"""
|
42
41
|
Ingest documents hosted on public URLs into a GroundX bucket.
|
43
42
|
|
43
|
+
[Supported Document Types and Ingest Capacities](https://docs.eyelevel.ai/documentation/fundamentals/document-types-and-ingest-capacities)
|
44
|
+
|
44
45
|
Parameters
|
45
46
|
----------
|
46
47
|
documents : typing.Sequence[IngestRemoteDocument]
|
@@ -125,6 +126,8 @@ class DocumentsClient:
|
|
125
126
|
"""
|
126
127
|
Upload documents hosted on a local file system into a GroundX bucket.
|
127
128
|
|
129
|
+
[Supported Document Types and Ingest Capacities](https://docs.eyelevel.ai/documentation/fundamentals/document-types-and-ingest-capacities)
|
130
|
+
|
128
131
|
Parameters
|
129
132
|
----------
|
130
133
|
request : DocumentLocalIngestRequest
|
@@ -205,9 +208,12 @@ class DocumentsClient:
|
|
205
208
|
) -> IngestResponse:
|
206
209
|
"""
|
207
210
|
Upload the content of a publicly accessible website for ingestion into a GroundX bucket. This is done by following links within a specified URL, recursively, up to a specified depth or number of pages.
|
211
|
+
|
208
212
|
Note1: This endpoint is currently not supported for on-prem deployments.
|
209
213
|
Note2: The `source_url` must include the protocol, http:// or https://.
|
210
214
|
|
215
|
+
[Supported Document Types and Ingest Capacities](https://docs.eyelevel.ai/documentation/fundamentals/document-types-and-ingest-capacities)
|
216
|
+
|
211
217
|
Parameters
|
212
218
|
----------
|
213
219
|
websites : typing.Sequence[WebsiteSource]
|
@@ -442,7 +448,7 @@ class DocumentsClient:
|
|
442
448
|
|
443
449
|
def get_processing_status_by_id(
|
444
450
|
self, process_id: str, *, request_options: typing.Optional[RequestOptions] = None
|
445
|
-
) -> ProcessStatusResponse:
|
451
|
+
) -> IngestResponse:
|
446
452
|
"""
|
447
453
|
Get the current status of an ingest, initiated with documents.ingest_remote, documents.ingest_local, or documents.crawl_website, by specifying the processId (the processId is included in the response of the documents.ingest functions).
|
448
454
|
|
@@ -456,7 +462,7 @@ class DocumentsClient:
|
|
456
462
|
|
457
463
|
Returns
|
458
464
|
-------
|
459
|
-
|
465
|
+
IngestResponse
|
460
466
|
Look up success
|
461
467
|
|
462
468
|
Examples
|
@@ -478,9 +484,9 @@ class DocumentsClient:
|
|
478
484
|
try:
|
479
485
|
if 200 <= _response.status_code < 300:
|
480
486
|
return typing.cast(
|
481
|
-
|
487
|
+
IngestResponse,
|
482
488
|
parse_obj_as(
|
483
|
-
type_=ProcessStatusResponse, # type: ignore
|
489
|
+
type_=IngestResponse, # type: ignore
|
484
490
|
object_=_response.json(),
|
485
491
|
),
|
486
492
|
)
|
@@ -522,12 +528,12 @@ class DocumentsClient:
|
|
522
528
|
request_options: typing.Optional[RequestOptions] = None,
|
523
529
|
) -> DocumentLookupResponse:
|
524
530
|
"""
|
525
|
-
lookup the document(s) associated with a processId, bucketId,
|
531
|
+
lookup the document(s) associated with a processId, bucketId, or groupId.
|
526
532
|
|
527
533
|
Parameters
|
528
534
|
----------
|
529
535
|
id : int
|
530
|
-
a processId, bucketId,
|
536
|
+
a processId, bucketId, or groupId
|
531
537
|
|
532
538
|
n : typing.Optional[int]
|
533
539
|
The maximum number of returned documents. Accepts 1-100 with a default of 20.
|
@@ -821,6 +827,8 @@ class AsyncDocumentsClient:
|
|
821
827
|
"""
|
822
828
|
Ingest documents hosted on public URLs into a GroundX bucket.
|
823
829
|
|
830
|
+
[Supported Document Types and Ingest Capacities](https://docs.eyelevel.ai/documentation/fundamentals/document-types-and-ingest-capacities)
|
831
|
+
|
824
832
|
Parameters
|
825
833
|
----------
|
826
834
|
documents : typing.Sequence[IngestRemoteDocument]
|
@@ -913,6 +921,8 @@ class AsyncDocumentsClient:
|
|
913
921
|
"""
|
914
922
|
Upload documents hosted on a local file system into a GroundX bucket.
|
915
923
|
|
924
|
+
[Supported Document Types and Ingest Capacities](https://docs.eyelevel.ai/documentation/fundamentals/document-types-and-ingest-capacities)
|
925
|
+
|
916
926
|
Parameters
|
917
927
|
----------
|
918
928
|
request : DocumentLocalIngestRequest
|
@@ -1005,9 +1015,12 @@ class AsyncDocumentsClient:
|
|
1005
1015
|
) -> IngestResponse:
|
1006
1016
|
"""
|
1007
1017
|
Upload the content of a publicly accessible website for ingestion into a GroundX bucket. This is done by following links within a specified URL, recursively, up to a specified depth or number of pages.
|
1018
|
+
|
1008
1019
|
Note1: This endpoint is currently not supported for on-prem deployments.
|
1009
1020
|
Note2: The `source_url` must include the protocol, http:// or https://.
|
1010
1021
|
|
1022
|
+
[Supported Document Types and Ingest Capacities](https://docs.eyelevel.ai/documentation/fundamentals/document-types-and-ingest-capacities)
|
1023
|
+
|
1011
1024
|
Parameters
|
1012
1025
|
----------
|
1013
1026
|
websites : typing.Sequence[WebsiteSource]
|
@@ -1266,7 +1279,7 @@ class AsyncDocumentsClient:
|
|
1266
1279
|
|
1267
1280
|
async def get_processing_status_by_id(
|
1268
1281
|
self, process_id: str, *, request_options: typing.Optional[RequestOptions] = None
|
1269
|
-
) -> ProcessStatusResponse:
|
1282
|
+
) -> IngestResponse:
|
1270
1283
|
"""
|
1271
1284
|
Get the current status of an ingest, initiated with documents.ingest_remote, documents.ingest_local, or documents.crawl_website, by specifying the processId (the processId is included in the response of the documents.ingest functions).
|
1272
1285
|
|
@@ -1280,7 +1293,7 @@ class AsyncDocumentsClient:
|
|
1280
1293
|
|
1281
1294
|
Returns
|
1282
1295
|
-------
|
1283
|
-
|
1296
|
+
IngestResponse
|
1284
1297
|
Look up success
|
1285
1298
|
|
1286
1299
|
Examples
|
@@ -1310,9 +1323,9 @@ class AsyncDocumentsClient:
|
|
1310
1323
|
try:
|
1311
1324
|
if 200 <= _response.status_code < 300:
|
1312
1325
|
return typing.cast(
|
1313
|
-
|
1326
|
+
IngestResponse,
|
1314
1327
|
parse_obj_as(
|
1315
|
-
type_=ProcessStatusResponse, # type: ignore
|
1328
|
+
type_=IngestResponse, # type: ignore
|
1316
1329
|
object_=_response.json(),
|
1317
1330
|
),
|
1318
1331
|
)
|
@@ -1354,12 +1367,12 @@ class AsyncDocumentsClient:
|
|
1354
1367
|
request_options: typing.Optional[RequestOptions] = None,
|
1355
1368
|
) -> DocumentLookupResponse:
|
1356
1369
|
"""
|
1357
|
-
lookup the document(s) associated with a processId, bucketId,
|
1370
|
+
lookup the document(s) associated with a processId, bucketId, or groupId.
|
1358
1371
|
|
1359
1372
|
Parameters
|
1360
1373
|
----------
|
1361
1374
|
id : int
|
1362
|
-
a processId, bucketId,
|
1375
|
+
a processId, bucketId, or groupId
|
1363
1376
|
|
1364
1377
|
n : typing.Optional[int]
|
1365
1378
|
The maximum number of returned documents. Accepts 1-100 with a default of 20.
|
@@ -9,6 +9,7 @@ from .csv_splitter import CSVSplitter
|
|
9
9
|
from .types.document import Document
|
10
10
|
from .types.ingest_remote_document import IngestRemoteDocument
|
11
11
|
from .types.ingest_response import IngestResponse
|
12
|
+
from .types.ingest_status import IngestStatus
|
12
13
|
|
13
14
|
# this is used as the default value for optional parameters
|
14
15
|
OMIT = typing.cast(typing.Any, ...)
|
@@ -140,6 +141,8 @@ class GroundX(GroundXBase):
|
|
140
141
|
self,
|
141
142
|
*,
|
142
143
|
documents: typing.Sequence[Document],
|
144
|
+
batch_size: typing.Optional[int] = 10,
|
145
|
+
wait_for_complete: typing.Optional[bool] = False,
|
143
146
|
upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
|
144
147
|
request_options: typing.Optional[RequestOptions] = None,
|
145
148
|
) -> IngestResponse:
|
@@ -150,6 +153,13 @@ class GroundX(GroundXBase):
|
|
150
153
|
----------
|
151
154
|
documents : typing.Sequence[Document]
|
152
155
|
|
156
|
+
# defines how many files to send per batch
|
157
|
+
# ignored unless wait_for_complete is True
|
158
|
+
batch_size : typing.Optional[int]
|
159
|
+
|
160
|
+
# will turn on progress bar and wait for ingestion to complete
|
161
|
+
wait_for_complete : typing.Optional[bool]
|
162
|
+
|
153
163
|
# an endpoint that accepts 'name' and 'type' query params
|
154
164
|
# and returns a presigned URL in a JSON dictionary with key 'URL'
|
155
165
|
upload_api : typing.Optional[str]
|
@@ -183,36 +193,84 @@ class GroundX(GroundXBase):
|
|
183
193
|
"""
|
184
194
|
remote_documents, local_documents = prep_documents(documents)
|
185
195
|
|
186
|
-
if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
|
187
|
-
raise ValueError("You have sent too many documents in this request")
|
188
|
-
|
189
196
|
if len(remote_documents) + len(local_documents) == 0:
|
190
197
|
raise ValueError("No valid documents were provided")
|
191
198
|
|
192
|
-
|
193
|
-
|
199
|
+
if wait_for_complete:
|
200
|
+
with tqdm(total=len(remote_documents) + len(local_documents), desc="Ingesting Files", unit="file") as pbar:
|
201
|
+
n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))
|
194
202
|
|
195
|
-
|
196
|
-
|
203
|
+
remote_batch: typing.List[IngestRemoteDocument] = []
|
204
|
+
ingest = IngestResponse(ingest=IngestStatus(process_id="",status="queued"))
|
197
205
|
|
198
|
-
|
199
|
-
|
200
|
-
|
206
|
+
progress = float(len(remote_documents))
|
207
|
+
for rd in remote_documents:
|
208
|
+
if len(remote_batch) >= n:
|
209
|
+
ingest = self.documents.ingest_remote(
|
210
|
+
documents=remote_batch,
|
211
|
+
request_options=request_options,
|
212
|
+
)
|
213
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
201
214
|
|
202
|
-
|
203
|
-
if len(splits) == 1 and d.file_name:
|
204
|
-
fn = d.file_name
|
215
|
+
remote_batch = []
|
205
216
|
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
217
|
+
remote_batch.append(rd)
|
218
|
+
pbar.update(0.25)
|
219
|
+
progress -= 0.25
|
220
|
+
|
221
|
+
if remote_batch:
|
222
|
+
ingest = self.documents.ingest_remote(
|
223
|
+
documents=remote_batch,
|
224
|
+
request_options=request_options,
|
214
225
|
)
|
215
|
-
|
226
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
227
|
+
|
228
|
+
|
229
|
+
if progress > 0:
|
230
|
+
pbar.update(progress)
|
231
|
+
|
232
|
+
current_batch_size = 0
|
233
|
+
local_batch: typing.List[Document] = []
|
234
|
+
|
235
|
+
progress = float(len(local_documents))
|
236
|
+
for ld in local_documents:
|
237
|
+
fp = Path(os.path.expanduser(ld.file_path))
|
238
|
+
file_size = fp.stat().st_size
|
239
|
+
|
240
|
+
if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (len(local_batch) >= n):
|
241
|
+
up_docs, progress = self._process_local(local_batch, upload_api, progress, pbar)
|
242
|
+
|
243
|
+
ingest = self.documents.ingest_remote(
|
244
|
+
documents=up_docs,
|
245
|
+
request_options=request_options,
|
246
|
+
)
|
247
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
248
|
+
|
249
|
+
local_batch = []
|
250
|
+
current_batch_size = 0
|
251
|
+
|
252
|
+
local_batch.append(ld)
|
253
|
+
current_batch_size += file_size
|
254
|
+
|
255
|
+
if local_batch:
|
256
|
+
up_docs, progress = self._process_local(local_batch, upload_api, progress, pbar)
|
257
|
+
|
258
|
+
ingest = self.documents.ingest_remote(
|
259
|
+
documents=up_docs,
|
260
|
+
request_options=request_options,
|
261
|
+
)
|
262
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
263
|
+
|
264
|
+
if progress > 0:
|
265
|
+
pbar.update(progress)
|
266
|
+
|
267
|
+
return ingest
|
268
|
+
elif len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
|
269
|
+
raise ValueError("You have sent too many documents in this request")
|
270
|
+
|
271
|
+
|
272
|
+
up_docs, _ = self._process_local(local_documents, upload_api)
|
273
|
+
remote_documents.extend(up_docs)
|
216
274
|
|
217
275
|
return self.documents.ingest_remote(
|
218
276
|
documents=remote_documents,
|
@@ -346,6 +404,92 @@ class GroundX(GroundXBase):
|
|
346
404
|
|
347
405
|
return strip_query_params(upload_url)
|
348
406
|
|
407
|
+
def _process_local(
|
408
|
+
self,
|
409
|
+
local_docs,
|
410
|
+
upload_api,
|
411
|
+
progress = None,
|
412
|
+
pbar = None,
|
413
|
+
):
|
414
|
+
remote_docs = []
|
415
|
+
for d in local_docs:
|
416
|
+
splits = split_doc(Path(os.path.expanduser(d.file_path)))
|
417
|
+
|
418
|
+
for sd in splits:
|
419
|
+
url = self._upload_file(upload_api, sd)
|
420
|
+
|
421
|
+
ft = d.file_type
|
422
|
+
if sd.suffix.lower() in SUFFIX_ALIASES:
|
423
|
+
ft = SUFFIX_ALIASES[sd.suffix.lower()]
|
424
|
+
|
425
|
+
fn = sd.name
|
426
|
+
if len(splits) == 1 and d.file_name:
|
427
|
+
fn = d.file_name
|
428
|
+
|
429
|
+
remote_docs.append(
|
430
|
+
IngestRemoteDocument(
|
431
|
+
bucket_id=d.bucket_id,
|
432
|
+
file_name=fn,
|
433
|
+
file_type=ft,
|
434
|
+
process_level=d.process_level,
|
435
|
+
search_data=d.search_data,
|
436
|
+
source_url=url,
|
437
|
+
)
|
438
|
+
)
|
439
|
+
|
440
|
+
if progress is not None and pbar is not None and pbar.update is not None:
|
441
|
+
pbar.update(0.25)
|
442
|
+
progress -= 0.25
|
443
|
+
|
444
|
+
return remote_docs, progress
|
445
|
+
|
446
|
+
def _monitor_batch(
|
447
|
+
self,
|
448
|
+
ingest,
|
449
|
+
progress,
|
450
|
+
pbar,
|
451
|
+
):
|
452
|
+
completed_files = set()
|
453
|
+
|
454
|
+
while (
|
455
|
+
ingest is not None
|
456
|
+
and ingest.ingest.status not in ["complete", "error", "cancelled"]
|
457
|
+
):
|
458
|
+
time.sleep(3)
|
459
|
+
ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
|
460
|
+
|
461
|
+
if ingest.ingest.progress:
|
462
|
+
if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
|
463
|
+
for doc in ingest.ingest.progress.processing.documents:
|
464
|
+
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
465
|
+
pbar.update(0.75)
|
466
|
+
progress -= 0.75
|
467
|
+
completed_files.add(doc.document_id)
|
468
|
+
if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
|
469
|
+
for doc in ingest.ingest.progress.complete.documents:
|
470
|
+
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
471
|
+
pbar.update(0.75)
|
472
|
+
progress -= 0.75
|
473
|
+
completed_files.add(doc.document_id)
|
474
|
+
if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
|
475
|
+
for doc in ingest.ingest.progress.cancelled.documents:
|
476
|
+
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
477
|
+
pbar.update(0.75)
|
478
|
+
progress -= 0.75
|
479
|
+
completed_files.add(doc.document_id)
|
480
|
+
if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
|
481
|
+
for doc in ingest.ingest.progress.errors.documents:
|
482
|
+
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
483
|
+
pbar.update(0.75)
|
484
|
+
progress -= 0.75
|
485
|
+
completed_files.add(doc.document_id)
|
486
|
+
|
487
|
+
|
488
|
+
if ingest.ingest.status in ["error", "cancelled"]:
|
489
|
+
raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
|
490
|
+
|
491
|
+
return ingest, progress
|
492
|
+
|
349
493
|
def _upload_file_batch(
|
350
494
|
self,
|
351
495
|
bucket_id,
|
@@ -356,7 +500,7 @@ class GroundX(GroundXBase):
|
|
356
500
|
):
|
357
501
|
docs = []
|
358
502
|
|
359
|
-
progress =
|
503
|
+
progress = float(len(batch))
|
360
504
|
for file in batch:
|
361
505
|
url = self._upload_file(upload_api, file)
|
362
506
|
if file.suffix.lower() in SUFFIX_ALIASES:
|
@@ -381,44 +525,10 @@ class GroundX(GroundXBase):
|
|
381
525
|
|
382
526
|
if docs:
|
383
527
|
ingest = self.ingest(documents=docs, request_options=request_options)
|
528
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
384
529
|
|
385
|
-
|
386
|
-
|
387
|
-
while (
|
388
|
-
ingest is not None
|
389
|
-
and ingest.ingest.status not in ["complete", "error", "cancelled"]
|
390
|
-
):
|
391
|
-
time.sleep(3)
|
392
|
-
ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
|
393
|
-
|
394
|
-
if ingest.ingest.progress:
|
395
|
-
if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
|
396
|
-
for doc in ingest.ingest.progress.processing.documents:
|
397
|
-
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
398
|
-
pbar.update(0.75)
|
399
|
-
progress -= 0.75
|
400
|
-
if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
|
401
|
-
for doc in ingest.ingest.progress.complete.documents:
|
402
|
-
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
403
|
-
pbar.update(0.75)
|
404
|
-
progress -= 0.75
|
405
|
-
if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
|
406
|
-
for doc in ingest.ingest.progress.cancelled.documents:
|
407
|
-
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
408
|
-
pbar.update(0.75)
|
409
|
-
progress -= 0.75
|
410
|
-
if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
|
411
|
-
for doc in ingest.ingest.progress.errors.documents:
|
412
|
-
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
413
|
-
pbar.update(0.75)
|
414
|
-
progress -= 0.75
|
415
|
-
|
416
|
-
|
417
|
-
if ingest.ingest.status in ["error", "cancelled"]:
|
418
|
-
raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
|
419
|
-
|
420
|
-
if progress > 0:
|
421
|
-
pbar.update(progress)
|
530
|
+
if progress > 0:
|
531
|
+
pbar.update(progress)
|
422
532
|
|
423
533
|
|
424
534
|
|
@@ -29,6 +29,7 @@ class SearchClient:
|
|
29
29
|
n: typing.Optional[int] = None,
|
30
30
|
next_token: typing.Optional[str] = None,
|
31
31
|
verbosity: typing.Optional[int] = None,
|
32
|
+
filter: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
|
32
33
|
relevance: typing.Optional[float] = OMIT,
|
33
34
|
request_options: typing.Optional[RequestOptions] = None,
|
34
35
|
) -> SearchResponse:
|
@@ -39,7 +40,7 @@ class SearchClient:
|
|
39
40
|
Parameters
|
40
41
|
----------
|
41
42
|
id : SearchContentRequestId
|
42
|
-
The bucketId, groupId,
|
43
|
+
The bucketId, groupId, or documentId to be searched. The document or documents within the specified container will be compared to the query, and relevant information will be extracted.
|
43
44
|
|
44
45
|
query : str
|
45
46
|
The search query to be used to find relevant documentation.
|
@@ -53,6 +54,9 @@ class SearchClient:
|
|
53
54
|
verbosity : typing.Optional[int]
|
54
55
|
The amount of data returned with each search result. 0 == no search results, only the recommended context. 1 == search results but no searchData. 2 == search results and searchData.
|
55
56
|
|
57
|
+
filter : typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]]
|
58
|
+
A dictionary of key-value pairs that can be used to pre-filter documents prior to a search.
|
59
|
+
|
56
60
|
relevance : typing.Optional[float]
|
57
61
|
The minimum search relevance score required to include the result. By default, this is 10.0.
|
58
62
|
|
@@ -87,6 +91,7 @@ class SearchClient:
|
|
87
91
|
},
|
88
92
|
json={
|
89
93
|
"query": query,
|
94
|
+
"filter": filter,
|
90
95
|
"relevance": relevance,
|
91
96
|
},
|
92
97
|
headers={
|
@@ -137,6 +142,7 @@ class SearchClient:
|
|
137
142
|
n: typing.Optional[int] = None,
|
138
143
|
next_token: typing.Optional[str] = None,
|
139
144
|
verbosity: typing.Optional[int] = None,
|
145
|
+
filter: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
|
140
146
|
relevance: typing.Optional[float] = OMIT,
|
141
147
|
request_options: typing.Optional[RequestOptions] = None,
|
142
148
|
) -> SearchResponse:
|
@@ -161,6 +167,9 @@ class SearchClient:
|
|
161
167
|
verbosity : typing.Optional[int]
|
162
168
|
The amount of data returned with each search result. 0 == no search results, only the recommended context. 1 == search results but no searchData. 2 == search results and searchData.
|
163
169
|
|
170
|
+
filter : typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]]
|
171
|
+
A dictionary of key-value pairs that can be used to pre-filter documents prior to a search.
|
172
|
+
|
164
173
|
relevance : typing.Optional[float]
|
165
174
|
The minimum search relevance score required to include the result. By default, this is 10.0.
|
166
175
|
|
@@ -196,6 +205,7 @@ class SearchClient:
|
|
196
205
|
json={
|
197
206
|
"query": query,
|
198
207
|
"documentIds": document_ids,
|
208
|
+
"filter": filter,
|
199
209
|
"relevance": relevance,
|
200
210
|
},
|
201
211
|
headers={
|
@@ -251,6 +261,7 @@ class AsyncSearchClient:
|
|
251
261
|
n: typing.Optional[int] = None,
|
252
262
|
next_token: typing.Optional[str] = None,
|
253
263
|
verbosity: typing.Optional[int] = None,
|
264
|
+
filter: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
|
254
265
|
relevance: typing.Optional[float] = OMIT,
|
255
266
|
request_options: typing.Optional[RequestOptions] = None,
|
256
267
|
) -> SearchResponse:
|
@@ -261,7 +272,7 @@ class AsyncSearchClient:
|
|
261
272
|
Parameters
|
262
273
|
----------
|
263
274
|
id : SearchContentRequestId
|
264
|
-
The bucketId, groupId,
|
275
|
+
The bucketId, groupId, or documentId to be searched. The document or documents within the specified container will be compared to the query, and relevant information will be extracted.
|
265
276
|
|
266
277
|
query : str
|
267
278
|
The search query to be used to find relevant documentation.
|
@@ -275,6 +286,9 @@ class AsyncSearchClient:
|
|
275
286
|
verbosity : typing.Optional[int]
|
276
287
|
The amount of data returned with each search result. 0 == no search results, only the recommended context. 1 == search results but no searchData. 2 == search results and searchData.
|
277
288
|
|
289
|
+
filter : typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]]
|
290
|
+
A dictionary of key-value pairs that can be used to pre-filter documents prior to a search.
|
291
|
+
|
278
292
|
relevance : typing.Optional[float]
|
279
293
|
The minimum search relevance score required to include the result. By default, this is 10.0.
|
280
294
|
|
@@ -317,6 +331,7 @@ class AsyncSearchClient:
|
|
317
331
|
},
|
318
332
|
json={
|
319
333
|
"query": query,
|
334
|
+
"filter": filter,
|
320
335
|
"relevance": relevance,
|
321
336
|
},
|
322
337
|
headers={
|
@@ -367,6 +382,7 @@ class AsyncSearchClient:
|
|
367
382
|
n: typing.Optional[int] = None,
|
368
383
|
next_token: typing.Optional[str] = None,
|
369
384
|
verbosity: typing.Optional[int] = None,
|
385
|
+
filter: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
|
370
386
|
relevance: typing.Optional[float] = OMIT,
|
371
387
|
request_options: typing.Optional[RequestOptions] = None,
|
372
388
|
) -> SearchResponse:
|
@@ -391,6 +407,9 @@ class AsyncSearchClient:
|
|
391
407
|
verbosity : typing.Optional[int]
|
392
408
|
The amount of data returned with each search result. 0 == no search results, only the recommended context. 1 == search results but no searchData. 2 == search results and searchData.
|
393
409
|
|
410
|
+
filter : typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]]
|
411
|
+
A dictionary of key-value pairs that can be used to pre-filter documents prior to a search.
|
412
|
+
|
394
413
|
relevance : typing.Optional[float]
|
395
414
|
The minimum search relevance score required to include the result. By default, this is 10.0.
|
396
415
|
|
@@ -434,6 +453,7 @@ class AsyncSearchClient:
|
|
434
453
|
json={
|
435
454
|
"query": query,
|
436
455
|
"documentIds": document_ids,
|
456
|
+
"filter": filter,
|
437
457
|
"relevance": relevance,
|
438
458
|
},
|
439
459
|
headers={
|