groundx 2.2.8__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {groundx-2.2.8 → groundx-2.3.0}/PKG-INFO +1 -1
- {groundx-2.2.8 → groundx-2.3.0}/pyproject.toml +1 -1
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/client_wrapper.py +1 -1
- groundx-2.3.0/src/groundx/csv_splitter.py +64 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/ingest.py +247 -93
- {groundx-2.2.8 → groundx-2.3.0}/LICENSE +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/README.md +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/buckets/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/buckets/client.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/client.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/api_error.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/datetime_utils.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/file.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/http_client.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/jsonable_encoder.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/pydantic_utilities.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/query_encoder.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/remove_none_from_dict.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/request_options.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/serialization.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/customer/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/customer/client.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/documents/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/documents/client.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/environment.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/errors/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/errors/bad_request_error.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/errors/unauthorized_error.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/groups/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/groups/client.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/health/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/health/client.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/py.typed +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/search/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/search/client.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/search/types/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/search/types/search_content_request_id.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/__init__.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bounding_box_detail.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_detail.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_list_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_update_detail.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_update_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/customer_detail.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/customer_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_detail.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_list_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_local_ingest_request.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_lookup_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_type.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/group_detail.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/group_list_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/group_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/health_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/health_response_health.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/health_service.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/health_service_status.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_local_document.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_local_document_metadata.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_remote_document.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_response_ingest.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/message_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/meter_detail.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_level.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress_cancelled.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress_complete.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress_errors.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress_processing.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/processes_status_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/processing_status.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/search_response.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/search_response_search.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/search_result_item.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/sort.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/sort_order.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/subscription_detail.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/subscription_detail_meters.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/website_source.py +0 -0
- {groundx-2.2.8 → groundx-2.3.0}/src/groundx/version.py +0 -0
@@ -0,0 +1,64 @@
|
|
1
|
+
import csv, math, os, tempfile, typing
|
2
|
+
from pathlib import Path
|
3
|
+
|
4
|
+
|
5
|
+
class CSVSplitter:
    """Split a large delimited text file into smaller files that each repeat the header.

    The number of output files is driven by two thresholds: one extra split
    for every 1000 counted rows and one extra split for every MiB of file
    size, whichever yields more.
    """

    def __init__(self, filepath, delimiter=','):
        """Record the source path and delimiter, and measure the file up front.

        Parameters
        ----------
        filepath
            Path of the delimited file to split.
        delimiter
            Field delimiter handed to ``csv.reader`` / ``csv.writer``.
        """
        self.filepath = filepath
        self.delimiter = delimiter
        self.filename = os.path.basename(filepath)
        self.file_size = os.path.getsize(filepath)
        self.rows_count = self.get_row_count()

    def get_row_count(self):
        """Return the number of physical lines in the file minus one (the header).

        This counts lines, not parsed CSV records, so quoted fields that span
        several lines are counted once per line. An empty file yields -1.
        """
        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
            return sum(1 for _ in csvfile) - 1

    def determine_splits(self):
        """Return how many output files the source should be divided into."""
        row_mod = int(self.rows_count / 1000) + 1
        file_mod = int(self.file_size / 1024 / 1024) + 1

        return max(row_mod, file_mod)

    def split(self):
        """Write the split files to a fresh temporary directory and return their paths.

        Returns
        -------
        typing.List[Path]
            ``[Path(self.filepath)]`` when no split is needed or when the file
            holds no data rows; otherwise the paths of the generated files,
            named ``<stem>_<n>.csv`` with ``n`` starting at 1.
        """
        splits = self.determine_splits()
        if splits < 2:
            return [Path(self.filepath)]

        # Clamp to 1 so the modulo below can never divide by zero when the
        # line count is 0 (e.g. a single, very long header line).
        rows_per_file = max(1, math.ceil(self.rows_count / splits))
        stem = os.path.splitext(self.filename)[0]

        split_files: typing.List[Path] = []
        temp_dir = None
        current_writer = None
        current_output_file = None

        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=self.delimiter)
            headers = next(reader, None)
            if headers is None:
                # Nothing to read at all: hand back the untouched source.
                return [Path(self.filepath)]

            try:
                for current_row, row in enumerate(reader):
                    if current_row % rows_per_file == 0:
                        if current_output_file is not None:
                            current_output_file.close()
                            current_output_file = None
                        if temp_dir is None:
                            # Created lazily so a header-only file leaves no
                            # empty temporary directory behind.
                            temp_dir = tempfile.mkdtemp()

                        # NOTE(review): the output suffix is always ".csv",
                        # even when the delimiter is a tab -- confirm that
                        # downstream consumers expect this for TSV input.
                        output_file_path = os.path.join(
                            temp_dir,
                            f"{stem}_{len(split_files) + 1}.csv",
                        )
                        current_output_file = open(
                            output_file_path, "w", newline="", encoding="utf-8"
                        )
                        split_files.append(Path(output_file_path))
                        current_writer = csv.writer(current_output_file, delimiter=self.delimiter)
                        current_writer.writerow(headers)

                    current_writer.writerow(row)
            finally:
                # Close the in-flight output even if reading or writing failed.
                if current_output_file is not None:
                    current_output_file.close()

        if not split_files:
            # Only a header was present: returning an empty list would make
            # callers silently drop the document.
            return [Path(self.filepath)]

        return split_files
|
@@ -5,10 +5,11 @@ from urllib.parse import urlparse, urlunparse
|
|
5
5
|
|
6
6
|
from .client import GroundXBase, AsyncGroundXBase
|
7
7
|
from .core.request_options import RequestOptions
|
8
|
+
from .csv_splitter import CSVSplitter
|
8
9
|
from .types.document import Document
|
9
|
-
from .types.document_type import DocumentType
|
10
10
|
from .types.ingest_remote_document import IngestRemoteDocument
|
11
11
|
from .types.ingest_response import IngestResponse
|
12
|
+
from .types.ingest_response_ingest import IngestResponseIngest
|
12
13
|
|
13
14
|
# this is used as the default value for optional parameters
|
14
15
|
OMIT = typing.cast(typing.Any, ...)
|
@@ -38,10 +39,18 @@ MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
|
|
38
39
|
|
39
40
|
ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
|
40
41
|
|
42
|
+
CSV_SPLITS = {
|
43
|
+
".csv": True,
|
44
|
+
}
|
45
|
+
TSV_SPLITS = {
|
46
|
+
".tsv": True,
|
47
|
+
}
|
48
|
+
|
41
49
|
SUFFIX_ALIASES = {
|
42
|
-
".jpeg": "
|
43
|
-
".heic": "
|
44
|
-
".tif": "
|
50
|
+
".jpeg": "jpg",
|
51
|
+
".heic": "heif",
|
52
|
+
".tif": "tiff",
|
53
|
+
".md": "txt",
|
45
54
|
}
|
46
55
|
|
47
56
|
MAX_BATCH_SIZE = 50
|
@@ -115,11 +124,25 @@ def prep_documents(
|
|
115
124
|
return remote_documents, local_documents
|
116
125
|
|
117
126
|
|
127
|
+
def split_doc(file):
    """Return the list of paths that should be ingested for ``file``.

    Paths that are not regular files, or whose lower-cased suffix is in
    neither ALLOWED_SUFFIXES nor SUFFIX_ALIASES, produce an empty list.
    Suffixes listed in CSV_SPLITS / TSV_SPLITS are passed through
    CSVSplitter (comma / tab delimiter); anything else is returned as a
    single-element list.
    """
    if not file.is_file():
        return []

    suffix = file.suffix.lower()
    if suffix not in ALLOWED_SUFFIXES and suffix not in SUFFIX_ALIASES:
        return []

    if suffix in CSV_SPLITS:
        return CSVSplitter(filepath=file).split()
    if suffix in TSV_SPLITS:
        return CSVSplitter(filepath=file, delimiter='\t').split()

    return [file]
|
138
|
+
|
118
139
|
class GroundX(GroundXBase):
|
119
140
|
def ingest(
|
120
141
|
self,
|
121
142
|
*,
|
122
143
|
documents: typing.Sequence[Document],
|
144
|
+
batch_size: typing.Optional[int] = 10,
|
145
|
+
wait_for_complete: typing.Optional[bool] = False,
|
123
146
|
upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
|
124
147
|
request_options: typing.Optional[RequestOptions] = None,
|
125
148
|
) -> IngestResponse:
|
@@ -130,6 +153,13 @@ class GroundX(GroundXBase):
|
|
130
153
|
----------
|
131
154
|
documents : typing.Sequence[Document]
|
132
155
|
|
156
|
+
# defines how many files to send per batch
|
157
|
+
# ignored unless wait_for_complete is True
|
158
|
+
batch_size : typing.Optional[int]
|
159
|
+
|
160
|
+
# will turn on progress bar and wait for ingestion to complete
|
161
|
+
wait_for_complete : typing.Optional[bool]
|
162
|
+
|
133
163
|
# an endpoint that accepts 'name' and 'type' query params
|
134
164
|
# and returns a presigned URL in a JSON dictionary with key 'URL'
|
135
165
|
upload_api : typing.Optional[str]
|
@@ -163,41 +193,87 @@ class GroundX(GroundXBase):
|
|
163
193
|
"""
|
164
194
|
remote_documents, local_documents = prep_documents(documents)
|
165
195
|
|
166
|
-
if local_documents
|
167
|
-
raise ValueError("
|
196
|
+
if len(remote_documents) + len(local_documents) == 0:
|
197
|
+
raise ValueError("No valid documents were provided")
|
168
198
|
|
169
|
-
if
|
170
|
-
|
171
|
-
|
199
|
+
if wait_for_complete:
|
200
|
+
with tqdm(total=len(remote_documents) + len(local_documents), desc="Ingesting Files", unit="file") as pbar:
|
201
|
+
n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))
|
172
202
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
203
|
+
remote_batch: typing.List[IngestRemoteDocument] = []
|
204
|
+
ingest = IngestResponse(ingest=IngestResponseIngest(process_id="",status="queued"))
|
205
|
+
|
206
|
+
progress = float(len(remote_documents))
|
207
|
+
for rd in remote_documents:
|
208
|
+
if len(remote_batch) >= n:
|
209
|
+
ingest = self.documents.ingest_remote(
|
210
|
+
documents=remote_batch,
|
211
|
+
request_options=request_options,
|
212
|
+
)
|
213
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
214
|
+
|
215
|
+
remote_batch = []
|
216
|
+
|
217
|
+
remote_batch.append(rd)
|
218
|
+
pbar.update(0.25)
|
219
|
+
progress -= 0.25
|
220
|
+
|
221
|
+
if remote_batch:
|
222
|
+
ingest = self.documents.ingest_remote(
|
223
|
+
documents=remote_batch,
|
224
|
+
request_options=request_options,
|
225
|
+
)
|
226
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
227
|
+
|
228
|
+
|
229
|
+
if progress > 0:
|
230
|
+
pbar.update(progress)
|
231
|
+
|
232
|
+
current_batch_size = 0
|
233
|
+
local_batch: typing.List[Document] = []
|
234
|
+
|
235
|
+
progress = float(len(local_documents))
|
236
|
+
for ld in local_documents:
|
237
|
+
fp = Path(os.path.expanduser(ld.file_path))
|
238
|
+
file_size = fp.stat().st_size
|
239
|
+
|
240
|
+
if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (len(local_batch) >= n):
|
241
|
+
up_docs, progress = self._process_local(local_batch, upload_api, progress, pbar)
|
177
242
|
|
178
|
-
|
243
|
+
ingest = self.documents.ingest_remote(
|
244
|
+
documents=up_docs,
|
245
|
+
request_options=request_options,
|
246
|
+
)
|
247
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
248
|
+
|
249
|
+
local_batch = []
|
250
|
+
current_batch_size = 0
|
251
|
+
|
252
|
+
local_batch.append(ld)
|
253
|
+
current_batch_size += file_size
|
254
|
+
|
255
|
+
if local_batch:
|
256
|
+
up_docs, progress = self._process_local(local_batch, upload_api, progress, pbar)
|
257
|
+
|
258
|
+
ingest = self.documents.ingest_remote(
|
259
|
+
documents=up_docs,
|
260
|
+
request_options=request_options,
|
261
|
+
)
|
262
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
263
|
+
|
264
|
+
if progress > 0:
|
265
|
+
pbar.update(progress)
|
266
|
+
|
267
|
+
return ingest
|
268
|
+
elif len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
|
179
269
|
raise ValueError("You have sent too many documents in this request")
|
180
270
|
|
181
|
-
if len(local_documents) == 0:
|
182
|
-
raise ValueError("No valid documents were provided")
|
183
271
|
|
184
|
-
|
185
|
-
|
186
|
-
url = self._upload_file(upload_api, Path(os.path.expanduser(d.file_path)))
|
187
|
-
|
188
|
-
docs.append(
|
189
|
-
IngestRemoteDocument(
|
190
|
-
bucket_id=d.bucket_id,
|
191
|
-
file_name=d.file_name,
|
192
|
-
file_type=d.file_type,
|
193
|
-
process_level=d.process_level,
|
194
|
-
search_data=d.search_data,
|
195
|
-
source_url=url,
|
196
|
-
)
|
197
|
-
)
|
272
|
+
up_docs, _ = self._process_local(local_documents, upload_api)
|
273
|
+
remote_documents.extend(up_docs)
|
198
274
|
|
199
275
|
return self.documents.ingest_remote(
|
200
|
-
documents=
|
276
|
+
documents=remote_documents,
|
201
277
|
request_options=request_options,
|
202
278
|
)
|
203
279
|
|
@@ -252,14 +328,10 @@ class GroundX(GroundXBase):
|
|
252
328
|
def load_directory_files(directory: str) -> typing.List[Path]:
|
253
329
|
dir_path = Path(directory)
|
254
330
|
|
255
|
-
matched_files = [
|
256
|
-
|
257
|
-
for
|
258
|
-
|
259
|
-
file.suffix.lower() in ALLOWED_SUFFIXES
|
260
|
-
or file.suffix.lower() in SUFFIX_ALIASES
|
261
|
-
)
|
262
|
-
]
|
331
|
+
matched_files: typing.List[Path] = []
|
332
|
+
for file in dir_path.rglob("*"):
|
333
|
+
for sd in split_doc(file):
|
334
|
+
matched_files.append(sd)
|
263
335
|
|
264
336
|
return matched_files
|
265
337
|
|
@@ -301,6 +373,8 @@ class GroundX(GroundXBase):
|
|
301
373
|
):
|
302
374
|
file_name = os.path.basename(file_path)
|
303
375
|
file_extension = os.path.splitext(file_name)[1][1:].lower()
|
376
|
+
if f".{file_extension}" in SUFFIX_ALIASES:
|
377
|
+
file_extension = SUFFIX_ALIASES[f".{file_extension}"]
|
304
378
|
|
305
379
|
presigned_info = get_presigned_url(endpoint, file_name, file_extension)
|
306
380
|
|
@@ -330,6 +404,92 @@ class GroundX(GroundXBase):
|
|
330
404
|
|
331
405
|
return strip_query_params(upload_url)
|
332
406
|
|
407
|
+
def _process_local(
    self,
    local_docs,
    upload_api,
    progress = None,
    pbar = None,
):
    """Upload local documents and describe them as remote documents.

    Each entry of ``local_docs`` is expanded with ``split_doc``; every
    resulting path is uploaded through ``self._upload_file`` and turned into
    an ``IngestRemoteDocument`` pointing at the returned URL.

    When both ``progress`` and ``pbar`` are supplied, the bar advances by
    0.25 per uploaded path and ``progress`` is reduced by the same amount.

    Returns a ``(remote_docs, progress)`` tuple.
    """
    uploaded = []

    for doc in local_docs:
        pieces = split_doc(Path(os.path.expanduser(doc.file_path)))
        single = len(pieces) == 1

        for piece in pieces:
            source_url = self._upload_file(upload_api, piece)

            # An aliased suffix overrides whatever type the caller declared.
            file_type = SUFFIX_ALIASES.get(piece.suffix.lower(), doc.file_type)

            # The caller's file name is only kept when the document was not split.
            file_name = doc.file_name if single and doc.file_name else piece.name

            uploaded.append(
                IngestRemoteDocument(
                    bucket_id=doc.bucket_id,
                    file_name=file_name,
                    file_type=file_type,
                    process_level=doc.process_level,
                    search_data=doc.search_data,
                    source_url=source_url,
                )
            )

            if progress is None or pbar is None or pbar.update is None:
                continue
            pbar.update(0.25)
            progress -= 0.25

    return uploaded, progress
|
445
|
+
|
446
|
+
def _monitor_batch(
    self,
    ingest,
    progress,
    pbar,
):
    """Poll an ingest process until it reaches a terminal status.

    While the status is not one of "complete", "error" or "cancelled", this
    sleeps three seconds, refreshes the status through
    ``self.documents.get_processing_status_by_id`` and, for every document
    newly reported with a terminal status, advances ``pbar`` by 0.75 and
    reduces ``progress`` by the same amount. Each document id is counted
    once per call.

    Returns
    -------
    tuple
        ``(ingest, progress)`` -- the last status response and the remaining
        progress. A ``None`` ingest is returned unchanged.

    Raises
    ------
    ValueError
        If the process ends with status "error" or "cancelled".
    """
    terminal = ("complete", "error", "cancelled")
    completed_files = set()

    while ingest is not None and ingest.ingest.status not in terminal:
        time.sleep(3)
        ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
        if ingest is None:
            # Nothing left to inspect; the None check below handles the exit.
            break

        report = ingest.ingest.progress
        if not report:
            continue

        # Same order as the status categories were originally inspected.
        for group in (report.processing, report.complete, report.cancelled, report.errors):
            if not group or not group.documents:
                continue
            for doc in group.documents:
                if doc.status in terminal and doc.document_id not in completed_files:
                    pbar.update(0.75)
                    progress -= 0.75
                    completed_files.add(doc.document_id)

    # The loop tolerates a missing response, so the final check must as well.
    if ingest is not None and ingest.ingest.status in ("error", "cancelled"):
        raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")

    return ingest, progress
|
492
|
+
|
333
493
|
def _upload_file_batch(
|
334
494
|
self,
|
335
495
|
bucket_id,
|
@@ -340,41 +500,35 @@ class GroundX(GroundXBase):
|
|
340
500
|
):
|
341
501
|
docs = []
|
342
502
|
|
343
|
-
progress =
|
503
|
+
progress = float(len(batch))
|
344
504
|
for file in batch:
|
345
505
|
url = self._upload_file(upload_api, file)
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
506
|
+
if file.suffix.lower() in SUFFIX_ALIASES:
|
507
|
+
docs.append(
|
508
|
+
Document(
|
509
|
+
bucket_id=bucket_id,
|
510
|
+
file_name=file.name,
|
511
|
+
file_path=url,
|
512
|
+
file_type=SUFFIX_ALIASES[file.suffix.lower()],
|
513
|
+
),
|
514
|
+
)
|
515
|
+
else:
|
516
|
+
docs.append(
|
517
|
+
Document(
|
518
|
+
bucket_id=bucket_id,
|
519
|
+
file_name=file.name,
|
520
|
+
file_path=url,
|
521
|
+
),
|
522
|
+
)
|
352
523
|
pbar.update(0.25)
|
353
524
|
progress -= 0.25
|
354
525
|
|
355
526
|
if docs:
|
356
527
|
ingest = self.ingest(documents=docs, request_options=request_options)
|
528
|
+
ingest, progress = self._monitor_batch(ingest, progress, pbar)
|
357
529
|
|
358
|
-
|
359
|
-
|
360
|
-
while (
|
361
|
-
ingest is not None
|
362
|
-
and ingest.ingest.status not in ["complete", "error", "cancelled"]
|
363
|
-
):
|
364
|
-
time.sleep(3)
|
365
|
-
ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
|
366
|
-
|
367
|
-
if ingest.ingest.progress and ingest.ingest.progress.processing:
|
368
|
-
for doc in ingest.ingest.progress.processing.documents:
|
369
|
-
if doc.status == "complete" and doc.document_id not in completed_files:
|
370
|
-
pbar.update(0.75)
|
371
|
-
progress -= 0.75
|
372
|
-
|
373
|
-
if ingest.ingest.status in ["error", "cancelled"]:
|
374
|
-
raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
|
375
|
-
|
376
|
-
if progress > 0:
|
377
|
-
pbar.update(progress)
|
530
|
+
if progress > 0:
|
531
|
+
pbar.update(progress)
|
378
532
|
|
379
533
|
|
380
534
|
|
@@ -431,41 +585,39 @@ class AsyncGroundX(AsyncGroundXBase):
|
|
431
585
|
"""
|
432
586
|
remote_documents, local_documents = prep_documents(documents)
|
433
587
|
|
434
|
-
if local_documents
|
435
|
-
raise ValueError("Documents must all be either local or remote, not a mix.")
|
436
|
-
|
437
|
-
if len(remote_documents) > 0:
|
438
|
-
if len(remote_documents) > MAX_BATCH_SIZE:
|
439
|
-
raise ValueError("You have sent too many documents in this request")
|
440
|
-
|
441
|
-
return await self.documents.ingest_remote(
|
442
|
-
documents=remote_documents,
|
443
|
-
request_options=request_options,
|
444
|
-
)
|
445
|
-
|
446
|
-
if len(local_documents) > MAX_BATCH_SIZE:
|
588
|
+
if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
|
447
589
|
raise ValueError("You have sent too many documents in this request")
|
448
590
|
|
449
|
-
if len(local_documents) == 0:
|
591
|
+
if len(remote_documents) + len(local_documents) == 0:
|
450
592
|
raise ValueError("No valid documents were provided")
|
451
593
|
|
452
|
-
docs: typing.List[IngestRemoteDocument] = []
|
453
594
|
for d in local_documents:
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
595
|
+
splits = split_doc(Path(os.path.expanduser(d.file_path)))
|
596
|
+
|
597
|
+
for sd in splits:
|
598
|
+
url = self._upload_file(upload_api, sd)
|
599
|
+
|
600
|
+
ft = d.file_type
|
601
|
+
if sd.suffix.lower() in SUFFIX_ALIASES:
|
602
|
+
ft = SUFFIX_ALIASES[sd.suffix.lower()]
|
603
|
+
|
604
|
+
fn = sd.name
|
605
|
+
if len(splits) == 1 and d.file_name:
|
606
|
+
fn = d.file_name
|
607
|
+
|
608
|
+
remote_documents.append(
|
609
|
+
IngestRemoteDocument(
|
610
|
+
bucket_id=d.bucket_id,
|
611
|
+
file_name=fn,
|
612
|
+
file_type=ft,
|
613
|
+
process_level=d.process_level,
|
614
|
+
search_data=d.search_data,
|
615
|
+
source_url=url,
|
616
|
+
)
|
464
617
|
)
|
465
|
-
)
|
466
618
|
|
467
619
|
return await self.documents.ingest_remote(
|
468
|
-
documents=
|
620
|
+
documents=remote_documents,
|
469
621
|
request_options=request_options,
|
470
622
|
)
|
471
623
|
|
@@ -476,6 +628,8 @@ class AsyncGroundX(AsyncGroundXBase):
|
|
476
628
|
):
|
477
629
|
file_name = os.path.basename(file_path)
|
478
630
|
file_extension = os.path.splitext(file_name)[1][1:].lower()
|
631
|
+
if f".{file_extension}" in SUFFIX_ALIASES:
|
632
|
+
file_extension = SUFFIX_ALIASES[f".{file_extension}"]
|
479
633
|
|
480
634
|
presigned_info = get_presigned_url(endpoint, file_name, file_extension)
|
481
635
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress_errors.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|