groundx 2.2.4__py3-none-any.whl → 2.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundx/core/client_wrapper.py +1 -1
- groundx/ingest.py +198 -2
- {groundx-2.2.4.dist-info → groundx-2.2.7.dist-info}/METADATA +2 -1
- {groundx-2.2.4.dist-info → groundx-2.2.7.dist-info}/RECORD +6 -6
- {groundx-2.2.4.dist-info → groundx-2.2.7.dist-info}/LICENSE +0 -0
- {groundx-2.2.4.dist-info → groundx-2.2.7.dist-info}/WHEEL +0 -0
groundx/core/client_wrapper.py
CHANGED
groundx/ingest.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
import aiohttp, io, json, mimetypes, requests, typing, os
|
1
|
+
import aiohttp, io, json, mimetypes, requests, time, typing, os
|
2
2
|
from asyncio import TimeoutError
|
3
|
-
from
|
3
|
+
from pathlib import Path
|
4
|
+
from tqdm import tqdm
|
5
|
+
from urllib.parse import urlparse, urlunparse
|
4
6
|
|
5
7
|
from json.decoder import JSONDecodeError
|
6
8
|
|
@@ -11,6 +13,7 @@ from .core.request_options import RequestOptions
|
|
11
13
|
from .errors.bad_request_error import BadRequestError
|
12
14
|
from .errors.unauthorized_error import UnauthorizedError
|
13
15
|
from .types.document import Document
|
16
|
+
from .types.document_type import DocumentType
|
14
17
|
from .types.ingest_remote_document import IngestRemoteDocument
|
15
18
|
from .types.ingest_response import IngestResponse
|
16
19
|
|
@@ -19,6 +22,14 @@ OMIT = typing.cast(typing.Any, ...)
|
|
19
22
|
|
20
23
|
|
21
24
|
DOCUMENT_TYPE_TO_MIME = {
|
25
|
+
"bmp": "image/bmp",
|
26
|
+
"gif": "image/gif",
|
27
|
+
"heif": "image/heif",
|
28
|
+
"hwp": "application/x-hwp",
|
29
|
+
"ico": "image/vnd.microsoft.icon",
|
30
|
+
"svg": "image/svg",
|
31
|
+
"tiff": "image/tiff",
|
32
|
+
"webp": "image/webp",
|
22
33
|
"txt": "text/plain",
|
23
34
|
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
24
35
|
"pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
@@ -32,6 +43,17 @@ DOCUMENT_TYPE_TO_MIME = {
|
|
32
43
|
}
|
33
44
|
MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
|
34
45
|
|
46
|
+
# File suffixes (with the leading dot) accepted by the directory scan,
# mapped to the MIME type registered for the matching document type.
ALLOWED_SUFFIXES = {
    "." + doc_type: mime for doc_type, mime in DOCUMENT_TYPE_TO_MIME.items()
}

# Alternate spellings that the directory scan also accepts, keyed by the
# alias and pointing at the suffix they stand for.
SUFFIX_ALIASES = {
    ".jpeg": ".jpg",
    ".heic": ".heif",
    ".tif": ".tiff",
}

# Bounds used to clamp the caller-supplied number of files per ingest batch.
MIN_BATCH_SIZE = 1
MAX_BATCH_SIZE = 50

# Upper bound on the summed on-disk size of the files in one batch (50 MiB).
MAX_BATCH_SIZE_BYTES = 50 * 1024 ** 2
|
35
57
|
|
36
58
|
def prep_documents(
|
37
59
|
documents: typing.Sequence[Document],
|
@@ -234,6 +256,180 @@ class GroundX(GroundXBase):
|
|
234
256
|
|
235
257
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
236
258
|
|
259
|
+
def ingest_directory(
    self,
    *,
    bucket_id: int,
    path: str,
    batch_size: typing.Optional[int] = 10,
    upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
    request_options: typing.Optional[RequestOptions] = None,
) -> None:
    """
    Ingest documents from a local directory into a GroundX bucket.

    The directory is scanned recursively for files whose suffix appears in
    ALLOWED_SUFFIXES or SUFFIX_ALIASES. Each file is uploaded through a
    presigned URL obtained from `upload_api`, the uploaded files are ingested
    in batches, and every batch is polled until its ingest process reports
    "complete", "error" or "cancelled". A tqdm progress bar tracks the files.

    Parameters
    ----------
    bucket_id : int
        Must be 1 or greater.

    path : str
        Local directory to scan. A leading "~" is expanded.

    batch_size : typing.Optional[int]
        Maximum number of files per batch. The value is clamped to the range
        MIN_BATCH_SIZE..MAX_BATCH_SIZE; None or 0 falls back to MIN_BATCH_SIZE.
        A batch is also closed early when adding the next file would push its
        total size past MAX_BATCH_SIZE_BYTES.

    # an endpoint that accepts 'name' and 'type' query params
    # and returns a presigned URL
    upload_api : typing.Optional[str]

    request_options : typing.Optional[RequestOptions]
        Request-specific configuration.

    Returns
    -------
    None
        The method returns once every batch has finished processing.

    Raises
    ------
    ValueError
        If `bucket_id` is below 1, `path` is not a directory, the directory
        holds no supported files, the presigned upload asks for an HTTP
        method other than PUT, or an ingest ends as "error" or "cancelled".
    requests.HTTPError
        If the presigned-URL endpoint answers with an error status.
    Exception
        If the file upload answers with a status other than 200 or 201.

    Examples
    --------
    from groundx import GroundX

    client = GroundX(
        api_key="YOUR_API_KEY",
    )

    client.ingest_directory(
        bucket_id=1234,
        path="/path/to/directory"
    )
    """

    def get_presigned_url(endpoint, file_name, file_extension) -> typing.Dict[str, typing.Any]:
        # Ask the upload endpoint for a presigned URL for this file.
        params = {"name": file_name, "type": file_extension}
        response = requests.get(endpoint, params=params)
        response.raise_for_status()

        return response.json()

    def load_directory_files(directory: str) -> typing.List[Path]:
        # Recursively collect regular files that carry a supported suffix.
        return [
            file
            for file in Path(directory).rglob("*")
            if file.is_file()
            and (
                file.suffix.lower() in ALLOWED_SUFFIXES
                or file.suffix.lower() in SUFFIX_ALIASES
            )
        ]

    def strip_query_params(url: str) -> str:
        # Keep scheme, host and path only; the query string is dropped.
        parsed = urlparse(url)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))

    def upload_file(endpoint, file_path) -> str:
        # Upload one file through a presigned URL and return that URL
        # without its query string.
        file_name = os.path.basename(file_path)
        file_extension = os.path.splitext(file_name)[1][1:].lower()

        presigned_info = get_presigned_url(endpoint, file_name, file_extension)

        upload_url = presigned_info["URL"]
        method = presigned_info.get("Method", "PUT").upper()

        # Header values may arrive as lists; only the first entry is sent.
        # A new dict is built so the decoded response is not mutated.
        headers = {
            key: value[0] if isinstance(value, list) else value
            for key, value in presigned_info.get("Header", {}).items()
        }

        if method != "PUT":
            raise ValueError(f"Unsupported HTTP method: {method}")

        with open(file_path, "rb") as f:
            file_data = f.read()

        upload_response = requests.put(upload_url, data=file_data, headers=headers)

        if upload_response.status_code not in (200, 201):
            raise Exception(
                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
            )

        return strip_query_params(upload_url)

    def _upload_file_batch(bucket_id, batch, upload_api, request_options, pbar):
        # Upload every file of the batch, start one ingest for all of them
        # and poll until the ingest reaches a terminal status.
        # Per file, the upload counts as 0.25 and processing as 0.75.
        docs = []

        progress = len(batch)
        for file in batch:
            url = upload_file(upload_api, file)
            docs.append(
                Document(
                    bucket_id=bucket_id,
                    file_path=url,
                ),
            )
            pbar.update(0.25)
            progress -= 0.25

        if not docs:
            return

        ingest = self.ingest(documents=docs, request_options=request_options)

        completed_files = set()

        while (
            ingest is not None
            and ingest.ingest.status not in ["complete", "error", "cancelled"]
        ):
            time.sleep(3)
            ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)

            if ingest.ingest.progress and ingest.ingest.progress.processing:
                for doc in ingest.ingest.progress.processing.documents or []:
                    if doc.status == "complete" and doc.document_id not in completed_files:
                        # Remember the document so later polls do not
                        # count it a second time.
                        completed_files.add(doc.document_id)
                        pbar.update(0.75)
                        progress -= 0.75

        # Checked after the loop so that a failure reported by the very
        # first response is not silently treated as success.
        if ingest is not None and ingest.ingest.status in ["error", "cancelled"]:
            raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")

        # Credit whatever part of the batch was not reported individually.
        if progress > 0:
            pbar.update(progress)

    if bucket_id < 1:
        raise ValueError(f"Invalid bucket_id: {bucket_id}")

    # Expand "~" once and use the expanded form for both the check and the
    # scan, so a path such as "~/docs" is handled consistently.
    directory = os.path.expanduser(path)
    if not os.path.isdir(directory):
        raise ValueError(f"Invalid directory path: {path}")

    files = load_directory_files(directory)

    if len(files) < 1:
        raise ValueError(f"No supported files found in directory: {path}")

    current_batch: typing.List[Path] = []
    current_batch_size: int = 0

    n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))

    with tqdm(total=len(files), desc="Ingesting Files", unit="file") as pbar:
        for file in files:
            file_size = file.stat().st_size

            over_byte_limit = current_batch_size + file_size > MAX_BATCH_SIZE_BYTES
            over_count_limit = len(current_batch) >= n

            # Flush only a non-empty batch: a single file larger than the
            # byte limit must not trigger an empty upload round.
            if current_batch and (over_byte_limit or over_count_limit):
                _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
                current_batch = []
                current_batch_size = 0

            current_batch.append(file)
            current_batch_size += file_size

        if current_batch:
            _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
|
431
|
+
|
432
|
+
|
237
433
|
|
238
434
|
class AsyncGroundX(AsyncGroundXBase):
|
239
435
|
async def ingest(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: groundx
|
3
|
-
Version: 2.2.4
|
3
|
+
Version: 2.2.7
|
4
4
|
Summary:
|
5
5
|
License: MIT
|
6
6
|
Requires-Python: >=3.8,<4.0
|
@@ -26,6 +26,7 @@ Requires-Dist: pydantic (>=1.9.2)
|
|
26
26
|
Requires-Dist: pydantic-core (>=2.18.2,<3.0.0)
|
27
27
|
Requires-Dist: requests (>=2.4.0)
|
28
28
|
Requires-Dist: tqdm (>=4.60.0)
|
29
|
+
Requires-Dist: types-tqdm (>=4.60.0)
|
29
30
|
Requires-Dist: typing_extensions (>=4.0.0)
|
30
31
|
Description-Content-Type: text/markdown
|
31
32
|
|
@@ -4,7 +4,7 @@ groundx/buckets/client.py,sha256=4jlc9vfIult1mMJ4FZW4_KFJybZPStZt1FUplIgrxbU,239
|
|
4
4
|
groundx/client.py,sha256=dIW9OyrMyfC1N7HSxRrHh0w_8rJ8osNUOPdYD6ueQ6g,6515
|
5
5
|
groundx/core/__init__.py,sha256=SQ85PF84B9MuKnBwHNHWemSGuy-g_515gFYNFhvEE0I,1438
|
6
6
|
groundx/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
|
7
|
-
groundx/core/client_wrapper.py,sha256=
|
7
|
+
groundx/core/client_wrapper.py,sha256=Bhc6L2UfeJoET17u-IIW6OWHD5GwdYaita2HNWDJjr4,1802
|
8
8
|
groundx/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
|
9
9
|
groundx/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
|
10
10
|
groundx/core/http_client.py,sha256=Z77OIxIbL4OAB2IDqjRq_sYa5yNYAWfmdhdCSSvh6Y4,19552
|
@@ -26,7 +26,7 @@ groundx/groups/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
|
|
26
26
|
groundx/groups/client.py,sha256=bytQRh9m7e4vIuYHb7dD1kCTQZvyBxedCqGnmmLqrsI,35237
|
27
27
|
groundx/health/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
|
28
28
|
groundx/health/client.py,sha256=fcTa21RWPyBuT77PQ0EncC6rBaW_DrYlRvudy9-0H58,7545
|
29
|
-
groundx/ingest.py,sha256=
|
29
|
+
groundx/ingest.py,sha256=RTgmeg_4cEaZynSEyf-3ArKGBcnhbcZhJl7BAeUeAMU,18187
|
30
30
|
groundx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
31
|
groundx/search/__init__.py,sha256=RagVzjShP33mDg9o4N3kGzV0egL1RYNjCpXPE8VzMYE,145
|
32
32
|
groundx/search/client.py,sha256=zrrqFy0HowDUYPsMU4nfvDV2RgmkEQ4E8WYNktu3xcs,18684
|
@@ -81,7 +81,7 @@ groundx/types/subscription_detail.py,sha256=WNfUw2EMVECIvNYcV2s51zZ6T3Utc4zYXw63
|
|
81
81
|
groundx/types/subscription_detail_meters.py,sha256=lBa8-1QlMVHjr5RLGqhiTKnD1KMM0AAHTWvz9TVtG8w,830
|
82
82
|
groundx/types/website_source.py,sha256=3WeRCiilNKKBTfhwgjo3jbcVI3vLTeM-KxI6dVzpg9o,1578
|
83
83
|
groundx/version.py,sha256=1yVogKaq260fQfckM2RYN2144SEw0QROsZW8ICtkG4U,74
|
84
|
-
groundx-2.2.
|
85
|
-
groundx-2.2.
|
86
|
-
groundx-2.2.
|
87
|
-
groundx-2.2.
|
84
|
+
groundx-2.2.7.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
|
85
|
+
groundx-2.2.7.dist-info/METADATA,sha256=Gx6smhve9G7ECJbxQBrw3pJ_LosNgtehM3VETuW0c9I,5173
|
86
|
+
groundx-2.2.7.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
87
|
+
groundx-2.2.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|