groundx 2.2.7__py3-none-any.whl → 2.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundx/core/client_wrapper.py +1 -1
- groundx/csv_splitter.py +64 -0
- groundx/ingest.py +272 -253
- {groundx-2.2.7.dist-info → groundx-2.2.9.dist-info}/METADATA +1 -1
- {groundx-2.2.7.dist-info → groundx-2.2.9.dist-info}/RECORD +7 -6
- {groundx-2.2.7.dist-info → groundx-2.2.9.dist-info}/LICENSE +0 -0
- {groundx-2.2.7.dist-info → groundx-2.2.9.dist-info}/WHEEL +0 -0
groundx/core/client_wrapper.py
CHANGED
groundx/csv_splitter.py
ADDED
@@ -0,0 +1,64 @@
+import csv, math, os, tempfile, typing
+from pathlib import Path
+
+
+class CSVSplitter:
+    def __init__(self, filepath, delimiter=','):
+        self.filepath = filepath
+        self.delimiter = delimiter
+        self.filename = os.path.basename(filepath)
+        self.file_size = os.path.getsize(filepath)
+        self.rows_count = self.get_row_count()
+
+    def get_row_count(self):
+        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+            return sum(1 for _ in csvfile) - 1
+
+    def determine_splits(self):
+        row_mod = int(self.rows_count / 1000) + 1
+        file_mod = int(self.file_size / 1024 / 1024) + 1
+
+        return max(row_mod, file_mod)
+
+    def split(self):
+        splits = self.determine_splits()
+        if splits < 2:
+            return [Path(self.filepath)]
+
+        rows_per_file = math.ceil(self.rows_count / splits)
+
+        split_files: typing.List[Path] = []
+        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+            reader = csv.reader(csvfile, delimiter=self.delimiter)
+            headers = next(reader)
+
+            temp_dir = tempfile.mkdtemp()
+
+            current_file_number = 1
+            current_row = 0
+            current_writer = None
+            current_output_file = None
+
+            for row in reader:
+                if current_row % rows_per_file == 0:
+                    if current_output_file:
+                        current_output_file.close()
+                    output_file_path = os.path.join(
+                        temp_dir,
+                        f"{os.path.splitext(self.filename)[0]}_{current_file_number}.csv",
+                    )
+                    split_files.append(Path(output_file_path))
+                    current_output_file = open(
+                        output_file_path, "w", newline="", encoding="utf-8"
+                    )
+                    current_writer = csv.writer(current_output_file, delimiter=self.delimiter)
+                    current_writer.writerow(headers)
+                    current_file_number += 1
+
+                current_writer.writerow(row)
+                current_row += 1
+
+        if current_output_file:
+            current_output_file.close()
+
+        return split_files
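For reference, a minimal usage sketch of the new splitter, assuming groundx 2.2.9 is installed; the sample data below is made up. split() hands back the original path for small files, and otherwise writes header-preserving chunks, one per max(rows per 1,000, size in MiB) unit, into a temporary directory:

import csv, os, tempfile

from groundx.csv_splitter import CSVSplitter

# Build a throwaway CSV with 3,500 data rows (hypothetical sample data).
sample = os.path.join(tempfile.mkdtemp(), "transactions.csv")
with open(sample, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "amount"])
    writer.writerows([i, i * 1.5] for i in range(3500))

# determine_splits() picks max(3500 // 1000 + 1, ~0 MiB + 1) = 4 here, so split()
# writes transactions_1.csv .. transactions_4.csv, each starting with the header row.
for part in CSVSplitter(filepath=sample).split():
    with open(part, newline="", encoding="utf-8") as f:
        print(part.name, sum(1 for _ in f) - 1, "rows")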
groundx/ingest.py
CHANGED
@@ -1,19 +1,12 @@
-import
-from asyncio import TimeoutError
+import requests, time, typing, os
 from pathlib import Path
 from tqdm import tqdm
 from urllib.parse import urlparse, urlunparse

-from json.decoder import JSONDecodeError
-
 from .client import GroundXBase, AsyncGroundXBase
-from .core.api_error import ApiError
-from .core.pydantic_utilities import parse_obj_as
 from .core.request_options import RequestOptions
-from .
-from .errors.unauthorized_error import UnauthorizedError
+from .csv_splitter import CSVSplitter
 from .types.document import Document
-from .types.document_type import DocumentType
 from .types.ingest_remote_document import IngestRemoteDocument
 from .types.ingest_response import IngestResponse

@@ -45,23 +38,48 @@ MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}

 ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}

+CSV_SPLITS = {
+    ".csv": True,
+}
+TSV_SPLITS = {
+    ".tsv": True,
+}
+
 SUFFIX_ALIASES = {
-    ".jpeg": "
-    ".heic": "
-    ".tif": "
+    ".jpeg": "jpg",
+    ".heic": "heif",
+    ".tif": "tiff",
+    ".md": "txt",
 }

 MAX_BATCH_SIZE = 50
 MIN_BATCH_SIZE = 1
 MAX_BATCH_SIZE_BYTES = 50 * 1024 * 1024

+def get_presigned_url(
+    endpoint: str,
+    file_name: str,
+    file_extension: str,
+) -> typing.Dict[str, typing.Any]:
+    params = {"name": file_name, "type": file_extension}
+    response = requests.get(endpoint, params=params)
+    response.raise_for_status()
+
+    return response.json()
+
+def strip_query_params(
+    url: str,
+) -> str:
+    parsed = urlparse(url)
+    clean_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
+
+    return clean_url
+
 def prep_documents(
     documents: typing.Sequence[Document],
 ) -> typing.Tuple[
     typing.List[IngestRemoteDocument],
-    typing.List[
-        typing.Tuple[str, typing.Tuple[typing.Union[str, None], typing.BinaryIO, str]]
-    ],
+    typing.List[Document],
 ]:
     """
     Process documents and separate them into remote and local documents.
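The two new module-level helpers spell out the upload contract used later in this file: get_presigned_url() issues a GET with 'name' and 'type' query parameters and expects a JSON body whose 'URL' key holds the presigned target (optionally with 'Header' and 'Method'), and strip_query_params() turns that presigned URL back into a stable source URL. A small sketch of both; the response values here are hypothetical:

from urllib.parse import urlparse, urlunparse

# Shape of the JSON that get_presigned_url() expects from the upload_api endpoint
# (field names taken from the code above; the values are made up).
presigned_info = {
    "URL": "https://uploads.example.com/bucket/report_1.csv?X-Amz-Signature=abc123",
    "Header": {"Content-Type": ["text/csv"]},   # list values are collapsed to their first item
    "Method": "PUT",
}

def strip_query_params(url: str) -> str:
    # Same behaviour as the helper added above: drop the query string and fragment.
    parsed = urlparse(url)
    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))

# The signed query string is removed before the URL is handed to ingest_remote.
print(strip_query_params(presigned_info["URL"]))
# -> https://uploads.example.com/bucket/report_1.csv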
@@ -80,9 +98,7 @@ def prep_documents(
         except ValueError:
             return False

-    local_documents: typing.List[
-        typing.Tuple[str, typing.Tuple[typing.Union[str, None], typing.BinaryIO, str]]
-    ] = []
+    local_documents: typing.List[Document] = []
     remote_documents: typing.List[IngestRemoteDocument] = []

     for document in documents:
@@ -100,64 +116,31 @@ def prep_documents(
             )
             remote_documents.append(remote_document)
         elif is_valid_local_path(document.file_path):
-
-            file_name = os.path.basename(expanded_path)
-            mime_type = mimetypes.guess_type(file_name)[0] or "application/octet-stream"
-            file_type = MIME_TO_DOCUMENT_TYPE.get(mime_type, None)
-            if document.file_type:
-                file_type = document.file_type
-                mime_type = DOCUMENT_TYPE_TO_MIME.get(
-                    document.file_type, "application/octet-stream"
-                )
-
-            if document.file_name:
-                file_name = document.file_name
-
-            try:
-                local_documents.append(
-                    (
-                        "blob",
-                        (
-                            file_name,
-                            open(expanded_path, "rb"),
-                            mime_type,
-                        ),
-                    )
-                )
-            except Exception as e:
-                raise ValueError(f"Error reading file {expanded_path}: {e}")
-
-            metadata = {
-                "bucketId": document.bucket_id,
-                "fileName": file_name,
-                "fileType": file_type,
-            }
-            if document.process_level:
-                metadata["processLevel"] = document.process_level
-            if document.search_data:
-                metadata["searchData"] = document.search_data
-
-            local_documents.append(
-                (
-                    "metadata",
-                    (
-                        f"data.json",
-                        io.BytesIO(json.dumps(metadata).encode("utf-8")),
-                        "application/json",
-                    ),
-                )
-            )
+            local_documents.append(document)
         else:
             raise ValueError(f"Invalid file path: {document.file_path}")

     return remote_documents, local_documents


+def split_doc(file):
+    if file.is_file() and (
+        file.suffix.lower() in ALLOWED_SUFFIXES
+        or file.suffix.lower() in SUFFIX_ALIASES
+    ):
+        if file.suffix.lower() in CSV_SPLITS:
+            return CSVSplitter(filepath=file).split()
+        elif file.suffix.lower() in TSV_SPLITS:
+            return CSVSplitter(filepath=file, delimiter='\t').split()
+        return [file]
+    return []
+
 class GroundX(GroundXBase):
     def ingest(
         self,
         *,
         documents: typing.Sequence[Document],
+        upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
         request_options: typing.Optional[RequestOptions] = None,
     ) -> IngestResponse:
         """
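split_doc() is the routing step the rewritten ingest paths share: CSV and TSV files are chunked through CSVSplitter, any other allowed or aliased suffix passes through untouched, and everything else is dropped. A hedged sketch of that routing, assuming groundx 2.2.9 is importable, that ".csv" is among the allowed document suffixes (as the CSV_SPLITS table suggests), and that ".bin" is not; the files are throwaway:

import tempfile
from pathlib import Path

from groundx.ingest import split_doc  # assumes groundx 2.2.9 is installed

root = Path(tempfile.mkdtemp())
(root / "notes.md").write_text("# hypothetical notes\n")
(root / "data.csv").write_text("a,b\n" + "1,2\n" * 5)
(root / "archive.bin").write_bytes(b"\x00\x01")  # assumed not to be an allowed suffix

for name in ("notes.md", "data.csv", "archive.bin"):
    parts = split_doc(root / name)
    # notes.md    -> [notes.md]  (".md" is aliased to "txt", so it passes through)
    # data.csv    -> [data.csv]  (well under the 1,000-row / 1 MiB split thresholds)
    # archive.bin -> []          (suffix neither allowed nor aliased, so it is skipped)
    print(name, [p.name for p in parts])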
@@ -167,6 +150,10 @@ class GroundX(GroundXBase):
         ----------
         documents : typing.Sequence[Document]

+        # an endpoint that accepts 'name' and 'type' query params
+        # and returns a presigned URL in a JSON dictionary with key 'URL'
+        upload_api : typing.Optional[str]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.

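The docstring above only sketches the contract a custom upload_api must satisfy. For illustration, here is one hypothetical stand-in built with the standard library: it accepts the 'name' and 'type' query parameters and answers with a JSON dictionary carrying 'URL', plus the optional 'Header' and 'Method' fields the client understands. A real deployment would return a genuinely presigned storage URL instead of this stub value:

import json
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import parse_qs, urlparse

class PresignStub(BaseHTTPRequestHandler):
    def do_GET(self):
        qs = parse_qs(urlparse(self.path).query)
        name = qs.get("name", ["unnamed"])[0]
        # A real handler would presign a storage URL here; this stub just echoes one back.
        body = json.dumps({
            "URL": f"https://uploads.example.com/incoming/{name}?X-Amz-Signature=stub",
            "Header": {"Content-Type": ["application/octet-stream"]},
            "Method": "PUT",
        }).encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

if __name__ == "__main__":
    HTTPServer(("127.0.0.1", 8080), PresignStub).serve_forever()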
@@ -196,65 +183,41 @@ class GroundX(GroundXBase):
         """
         remote_documents, local_documents = prep_documents(documents)

-        if local_documents
-            raise ValueError("
+        if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
+            raise ValueError("You have sent too many documents in this request")

-        if len(remote_documents)
-
-                documents=remote_documents,
-                request_options=request_options,
-            )
+        if len(remote_documents) + len(local_documents) == 0:
+            raise ValueError("No valid documents were provided")

-
-
-        base_url = self._client_wrapper.get_base_url().rstrip("/")
-        follow_redirects = getattr(
-            self._client_wrapper.httpx_client, "follow_redirects", True
-        )
+        for d in local_documents:
+            splits = split_doc(Path(os.path.expanduser(d.file_path)))

-
-
-            url,
-            files=local_documents,
-            headers=headers,
-            timeout=timeout,
-            allow_redirects=follow_redirects,
-        )
+            for sd in splits:
+                url = self._upload_file(upload_api, sd)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                ),
-            )
-        )
-        if _response.status_code == 401:
-            raise UnauthorizedError(
-                typing.cast(
-                    typing.Optional[typing.Any],
-                    parse_obj_as(
-                        type_=typing.Optional[typing.Any], # type: ignore
-                        object_=_response.json(),
-                    ),
+                ft = d.file_type
+                if sd.suffix.lower() in SUFFIX_ALIASES:
+                    ft = SUFFIX_ALIASES[sd.suffix.lower()]
+
+                fn = sd.name
+                if len(splits) == 1 and d.file_name:
+                    fn = d.file_name
+
+                remote_documents.append(
+                    IngestRemoteDocument(
+                        bucket_id=d.bucket_id,
+                        file_name=fn,
+                        file_type=ft,
+                        process_level=d.process_level,
+                        search_data=d.search_data,
+                        source_url=url,
                     )
                 )
-            _response_json = _response.json()
-        except JSONDecodeError:
-            raise ApiError(status_code=_response.status_code, body=_response.text)

-
+        return self.documents.ingest_remote(
+            documents=remote_documents,
+            request_options=request_options,
+        )

     def ingest_directory(
         self,
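With the rewrite, local files no longer go up as one multipart request: each local document is expanded with split_doc(), every piece is uploaded through the presigned-URL helper, and the resulting URLs are submitted as remote documents in a single ingest_remote call. A hedged usage sketch, assuming the package is installed and that the API key, bucket id, and file paths below are placeholders:

from groundx import Document, GroundX

client = GroundX(api_key="YOUR_API_KEY")  # hypothetical credentials

response = client.ingest(
    documents=[
        # A local CSV: expanded by split_doc(), each chunk uploaded via the presigned flow.
        Document(bucket_id=1234, file_path="~/exports/transactions.csv"),
        # A remote document: passed straight through to ingest_remote.
        Document(bucket_id=1234, file_path="https://example.com/whitepaper.pdf"),
    ],
    # upload_api defaults to https://api.eyelevel.ai/upload/file; point it at your own
    # endpoint if files must be staged elsewhere.
)
print(response.ingest.process_id, response.ingest.status)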
@@ -275,7 +238,7 @@ class GroundX(GroundXBase):
         batch_size : type.Optional[int]

         # an endpoint that accepts 'name' and 'type' query params
-        # and returns a presigned URL
+        # and returns a presigned URL in a JSON dictionary with key 'URL'
         upload_api : typing.Optional[str]

         request_options : typing.Optional[RequestOptions]
@@ -300,13 +263,6 @@ class GroundX(GroundXBase):
         )
         """

-        def get_presigned_url(endpoint, file_name, file_extension) -> typing.Dict[str, typing.Any]:
-            params = {"name": file_name, "type": file_extension}
-            response = requests.get(endpoint, params=params)
-            response.raise_for_status()
-
-            return response.json()
-
         def is_valid_local_directory(path: str) -> bool:
             expanded_path = os.path.expanduser(path)
             return os.path.isdir(expanded_path)
@@ -314,89 +270,12 @@ class GroundX(GroundXBase):
         def load_directory_files(directory: str) -> typing.List[Path]:
             dir_path = Path(directory)

-            matched_files = [
-
-                for
-
-                    file.suffix.lower() in ALLOWED_SUFFIXES
-                    or file.suffix.lower() in SUFFIX_ALIASES
-                )
-            ]
+            matched_files: typing.List[Path] = []
+            for file in dir_path.rglob("*"):
+                for sd in split_doc(file):
+                    matched_files.append(sd)

-            return matched_files
-
-        def strip_query_params(url: str) -> str:
-            parsed = urlparse(url)
-            clean_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
-            return clean_url
-
-        def _upload_file_batch(bucket_id, batch, upload_api, request_options, pbar):
-            docs = []
-
-            progress = len(batch)
-            for file in batch:
-                url = upload_file(upload_api, file)
-                docs.append(
-                    Document(
-                        bucket_id=bucket_id,
-                        file_path=url,
-                    ),
-                )
-                pbar.update(0.25)
-                progress -= 0.25
-
-            if docs:
-                ingest = self.ingest(documents=docs, request_options=request_options)
-
-                completed_files = set()
-
-                while (
-                    ingest is not None
-                    and ingest.ingest.status not in ["complete", "error", "cancelled"]
-                ):
-                    time.sleep(3)
-                    ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
-
-                    if ingest.ingest.progress and ingest.ingest.progress.processing:
-                        for doc in ingest.ingest.progress.processing.documents:
-                            if doc.status == "complete" and doc.document_id not in completed_files:
-                                pbar.update(0.75)
-                                progress -= 0.75
-
-                if ingest.ingest.status in ["error", "cancelled"]:
-                    raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
-
-                if progress > 0:
-                    pbar.update(progress)
-
-        def upload_file(endpoint, file_path) -> str:
-            file_name = os.path.basename(file_path)
-            file_extension = os.path.splitext(file_name)[1][1:].lower()
-
-            presigned_info = get_presigned_url(endpoint, file_name, file_extension)
-
-            upload_url = presigned_info["URL"]
-            headers = presigned_info.get("Header", {})
-            method = presigned_info.get("Method", "PUT").upper()
-
-            for key, value in headers.items():
-                if isinstance(value, list):
-                    headers[key] = value[0]
-
-            with open(file_path, "rb") as f:
-                file_data = f.read()
-
-            if method == "PUT":
-                upload_response = requests.put(upload_url, data=file_data, headers=headers)
-            else:
-                raise ValueError(f"Unsupported HTTP method: {method}")
-
-            if upload_response.status_code not in (200, 201):
-                raise Exception(
-                    f"Upload failed: {upload_response.status_code} - {upload_response.text}"
-                )
-
-            return strip_query_params(upload_url)
+            return matched_files

         if bucket_id < 1:
             raise ValueError(f"Invalid bucket_id: {bucket_id}")
@@ -419,7 +298,7 @@ class GroundX(GroundXBase):
             file_size = file.stat().st_size

             if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (len(current_batch) >= n):
-                _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+                self._upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
                 current_batch = []
                 current_batch_size = 0

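ingest_directory still batches by both count and size before each upload: a batch is flushed when adding the next file would cross 50 MiB (MAX_BATCH_SIZE_BYTES) or when it already holds n files (derived from batch_size). A small standalone sketch of that rule, using hypothetical file sizes:

MAX_BATCH_SIZE_BYTES = 50 * 1024 * 1024  # same constant as in groundx/ingest.py

def plan_batches(sizes, n=10):
    """Group file sizes the way ingest_directory does before each upload batch."""
    batches, current, current_bytes = [], [], 0
    for size in sizes:
        if current_bytes + size > MAX_BATCH_SIZE_BYTES or len(current) >= n:
            batches.append(current)   # ingest_directory uploads the batch here
            current, current_bytes = [], 0
        current.append(size)
        current_bytes += size
    if current:
        batches.append(current)       # trailing partial batch
    return batches

# Three 30 MiB files: the second would push a batch past 50 MiB, so each lands alone.
print([len(b) for b in plan_batches([30 * 1024 * 1024] * 3)])  # -> [1, 1, 1]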
@@ -427,7 +306,119 @@ class GroundX(GroundXBase):
             current_batch_size += file_size

         if current_batch:
-            _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+            self._upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+
+    def _upload_file(
+        self,
+        endpoint,
+        file_path,
+    ):
+        file_name = os.path.basename(file_path)
+        file_extension = os.path.splitext(file_name)[1][1:].lower()
+        if f".{file_extension}" in SUFFIX_ALIASES:
+            file_extension = SUFFIX_ALIASES[f".{file_extension}"]
+
+        presigned_info = get_presigned_url(endpoint, file_name, file_extension)
+
+        upload_url = presigned_info["URL"]
+        headers = presigned_info.get("Header", {})
+        method = presigned_info.get("Method", "PUT").upper()
+
+        for key, value in headers.items():
+            if isinstance(value, list):
+                headers[key] = value[0]
+
+        try:
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+        except Exception as e:
+            raise ValueError(f"Error reading file {file_path}: {e}")
+
+        if method == "PUT":
+            upload_response = requests.put(upload_url, data=file_data, headers=headers)
+        else:
+            raise ValueError(f"Unsupported HTTP method: {method}")
+
+        if upload_response.status_code not in (200, 201):
+            raise Exception(
+                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
+            )
+
+        return strip_query_params(upload_url)
+
+    def _upload_file_batch(
+        self,
+        bucket_id,
+        batch,
+        upload_api,
+        request_options,
+        pbar,
+    ):
+        docs = []
+
+        progress = len(batch)
+        for file in batch:
+            url = self._upload_file(upload_api, file)
+            if file.suffix.lower() in SUFFIX_ALIASES:
+                docs.append(
+                    Document(
+                        bucket_id=bucket_id,
+                        file_name=file.name,
+                        file_path=url,
+                        file_type=SUFFIX_ALIASES[file.suffix.lower()],
+                    ),
+                )
+            else:
+                docs.append(
+                    Document(
+                        bucket_id=bucket_id,
+                        file_name=file.name,
+                        file_path=url,
+                    ),
+                )
+            pbar.update(0.25)
+            progress -= 0.25
+
+        if docs:
+            ingest = self.ingest(documents=docs, request_options=request_options)
+
+            completed_files = set()
+
+            while (
+                ingest is not None
+                and ingest.ingest.status not in ["complete", "error", "cancelled"]
+            ):
+                time.sleep(3)
+                ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
+
+                if ingest.ingest.progress:
+                    if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
+                        for doc in ingest.ingest.progress.processing.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
+                        for doc in ingest.ingest.progress.complete.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
+                        for doc in ingest.ingest.progress.cancelled.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
+                        for doc in ingest.ingest.progress.errors.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+
+
+            if ingest.ingest.status in ["error", "cancelled"]:
+                raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
+
+            if progress > 0:
+                pbar.update(progress)



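The new _upload_file_batch keeps the earlier tqdm accounting but now watches every terminal bucket of the processing status (processing, complete, cancelled, errors): each file contributes 1.0 unit to the bar, 0.25 when its presigned upload finishes and 0.75 once its document is observed in a terminal state, with any remainder flushed after the loop. A toy sketch of that arithmetic, assuming tqdm is installed (it is already imported by this module):

from tqdm import tqdm

files = ["a.pdf", "b.csv", "c.txt"]      # hypothetical batch
with tqdm(total=len(files), unit="file") as pbar:
    remaining = float(len(files))
    for _ in files:                      # presigned upload succeeded
        pbar.update(0.25)
        remaining -= 0.25
    for _ in files[:2]:                  # two documents observed in a terminal status
        pbar.update(0.75)
        remaining -= 0.75
    if remaining > 0:                    # flush whatever was never observed, as the SDK does
        pbar.update(remaining)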
@@ -436,6 +427,7 @@ class AsyncGroundX(AsyncGroundXBase):
         self,
         *,
         documents: typing.Sequence[Document],
+        upload_api: str = "https://api.eyelevel.ai/upload/file",
         request_options: typing.Optional[RequestOptions] = None,
     ) -> IngestResponse:
         """
@@ -445,6 +437,10 @@ class AsyncGroundX(AsyncGroundXBase):
         ----------
         documents : typing.Sequence[Document]

+        # an endpoint that accepts 'name' and 'type' query params
+        # and returns a presigned URL in a JSON dictionary with key 'URL'
+        upload_api : typing.Optional[str]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.

@@ -479,53 +475,76 @@ class AsyncGroundX(AsyncGroundXBase):
         """
         remote_documents, local_documents = prep_documents(documents)

-        if local_documents
-            raise ValueError("
+        if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
+            raise ValueError("You have sent too many documents in this request")

-        if len(remote_documents)
-
-                documents=remote_documents,
-                request_options=request_options,
-            )
+        if len(remote_documents) + len(local_documents) == 0:
+            raise ValueError("No valid documents were provided")

-
-
-        base_url = self._client_wrapper.get_base_url().rstrip("/")
+        for d in local_documents:
+            splits = split_doc(Path(os.path.expanduser(d.file_path)))

-
+            for sd in splits:
+                url = self._upload_file(upload_api, sd)

-
-
-
-        for field_name, (file_name, file_obj, content_type) in local_documents:
-            data.add_field(
-                name=field_name,
-                value=file_obj,
-                filename=file_name,
-                content_type=content_type,
-            )
+                ft = d.file_type
+                if sd.suffix.lower() in SUFFIX_ALIASES:
+                    ft = SUFFIX_ALIASES[sd.suffix.lower()]

-
-
-
-
-
-
-
-
-
-
-
-
-        if response.status == 400:
-            raise BadRequestError(await response.json())
-        if response.status == 401:
-            raise UnauthorizedError(await response.json())
-
-        raise ApiError(
-            status_code=response.status, body=await response.text()
+                fn = sd.name
+                if len(splits) == 1 and d.file_name:
+                    fn = d.file_name
+
+                remote_documents.append(
+                    IngestRemoteDocument(
+                        bucket_id=d.bucket_id,
+                        file_name=fn,
+                        file_type=ft,
+                        process_level=d.process_level,
+                        search_data=d.search_data,
+                        source_url=url,
                     )
-
-
-
-
+                )
+
+        return await self.documents.ingest_remote(
+            documents=remote_documents,
+            request_options=request_options,
+        )
+
+    def _upload_file(
+        self,
+        endpoint,
+        file_path,
+    ):
+        file_name = os.path.basename(file_path)
+        file_extension = os.path.splitext(file_name)[1][1:].lower()
+        if f".{file_extension}" in SUFFIX_ALIASES:
+            file_extension = SUFFIX_ALIASES[f".{file_extension}"]
+
+        presigned_info = get_presigned_url(endpoint, file_name, file_extension)
+
+        upload_url = presigned_info["URL"]
+        headers = presigned_info.get("Header", {})
+        method = presigned_info.get("Method", "PUT").upper()
+
+        for key, value in headers.items():
+            if isinstance(value, list):
+                headers[key] = value[0]
+
+        try:
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+        except Exception as e:
+            raise ValueError(f"Error reading file {file_path}: {e}")
+
+        if method == "PUT":
+            upload_response = requests.put(upload_url, data=file_data, headers=headers)
+        else:
+            raise ValueError(f"Unsupported HTTP method: {method}")
+
+        if upload_response.status_code not in (200, 201):
+            raise Exception(
+                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
+            )
+
+        return strip_query_params(upload_url)
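AsyncGroundX.ingest mirrors the synchronous rewrite, down to the same presigned-URL helper; note that the uploads themselves still use blocking requests.put, and only the final ingest_remote call is awaited. A hedged usage sketch, assuming the package is installed and the credentials, bucket id, and file path below are placeholders:

import asyncio

from groundx import AsyncGroundX, Document

async def main() -> None:
    client = AsyncGroundX(api_key="YOUR_API_KEY")  # hypothetical credentials
    response = await client.ingest(
        documents=[Document(bucket_id=1234, file_path="~/exports/metrics.tsv")],
        # upload_api defaults to https://api.eyelevel.ai/upload/file
    )
    print(response.ingest.process_id)

asyncio.run(main())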
{groundx-2.2.7.dist-info → groundx-2.2.9.dist-info}/RECORD
CHANGED
@@ -4,7 +4,7 @@ groundx/buckets/client.py,sha256=4jlc9vfIult1mMJ4FZW4_KFJybZPStZt1FUplIgrxbU,239
 groundx/client.py,sha256=dIW9OyrMyfC1N7HSxRrHh0w_8rJ8osNUOPdYD6ueQ6g,6515
 groundx/core/__init__.py,sha256=SQ85PF84B9MuKnBwHNHWemSGuy-g_515gFYNFhvEE0I,1438
 groundx/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
-groundx/core/client_wrapper.py,sha256=
+groundx/core/client_wrapper.py,sha256=D6uZpUYxYzmgxNNCTN7quiFvNlBQmPOyLstnrXfcJcs,1802
 groundx/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
 groundx/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
 groundx/core/http_client.py,sha256=Z77OIxIbL4OAB2IDqjRq_sYa5yNYAWfmdhdCSSvh6Y4,19552
@@ -14,6 +14,7 @@ groundx/core/query_encoder.py,sha256=ekulqNd0j8TgD7ox-Qbz7liqX8-KP9blvT9DsRCenYM
 groundx/core/remove_none_from_dict.py,sha256=EU9SGgYidWq7SexuJbNs4-PZ-5Bl3Vppd864mS6vQZw,342
 groundx/core/request_options.py,sha256=h0QUNCFVdCW_7GclVySCAY2w4NhtXVBUCmHgmzaxpcg,1681
 groundx/core/serialization.py,sha256=D9h_t-RQON3-CHWs1C4ESY9B-Yd5d-l5lnTLb_X896g,9601
+groundx/csv_splitter.py,sha256=6HGXdDpwBX_IJaCbla1WuirJERBTvjLzBf9OBtwGFWU,2254
 groundx/customer/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/customer/client.py,sha256=C_JANeDewRD1Kg-q7LPxdiOSWbYSTOiYlBYZLRYPI44,3467
 groundx/documents/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
@@ -26,7 +27,7 @@ groundx/groups/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/groups/client.py,sha256=bytQRh9m7e4vIuYHb7dD1kCTQZvyBxedCqGnmmLqrsI,35237
 groundx/health/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/health/client.py,sha256=fcTa21RWPyBuT77PQ0EncC6rBaW_DrYlRvudy9-0H58,7545
-groundx/ingest.py,sha256=
+groundx/ingest.py,sha256=LtnUGcgtE1MNYL3PGFrzPqRMnLeOxr-fVsZ3fmTAUKI,18294
 groundx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 groundx/search/__init__.py,sha256=RagVzjShP33mDg9o4N3kGzV0egL1RYNjCpXPE8VzMYE,145
 groundx/search/client.py,sha256=zrrqFy0HowDUYPsMU4nfvDV2RgmkEQ4E8WYNktu3xcs,18684
@@ -81,7 +82,7 @@ groundx/types/subscription_detail.py,sha256=WNfUw2EMVECIvNYcV2s51zZ6T3Utc4zYXw63
 groundx/types/subscription_detail_meters.py,sha256=lBa8-1QlMVHjr5RLGqhiTKnD1KMM0AAHTWvz9TVtG8w,830
 groundx/types/website_source.py,sha256=3WeRCiilNKKBTfhwgjo3jbcVI3vLTeM-KxI6dVzpg9o,1578
 groundx/version.py,sha256=1yVogKaq260fQfckM2RYN2144SEw0QROsZW8ICtkG4U,74
-groundx-2.2.
-groundx-2.2.
-groundx-2.2.
-groundx-2.2.
+groundx-2.2.9.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
+groundx-2.2.9.dist-info/METADATA,sha256=1BWmC2-Lx8AT1vEY0juSK4ugjxyn9W5ndJy0bomFEwQ,5173
+groundx-2.2.9.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+groundx-2.2.9.dist-info/RECORD,,
File without changes
|
File without changes
|