groundx 2.2.3__py3-none-any.whl → 2.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundx/core/client_wrapper.py +1 -1
- groundx/core/http_client.py +4 -4
- groundx/ingest.py +198 -2
- {groundx-2.2.3.dist-info → groundx-2.2.7.dist-info}/METADATA +3 -1
- {groundx-2.2.3.dist-info → groundx-2.2.7.dist-info}/RECORD +7 -7
- {groundx-2.2.3.dist-info → groundx-2.2.7.dist-info}/LICENSE +0 -0
- {groundx-2.2.3.dist-info → groundx-2.2.7.dist-info}/WHEEL +0 -0
groundx/core/client_wrapper.py
CHANGED
groundx/core/http_client.py
CHANGED
@@ -183,7 +183,7 @@ class HttpClient:
         files: typing.Optional[typing.Dict[str, typing.Optional[typing.Union[File, typing.List[File]]]]] = None,
         headers: typing.Optional[typing.Dict[str, typing.Any]] = None,
         request_options: typing.Optional[RequestOptions] = None,
-        retries: int =
+        retries: int = 2,
         omit: typing.Optional[typing.Any] = None,
     ) -> httpx.Response:
         base_url = self.get_base_url(base_url)
@@ -269,7 +269,7 @@ class HttpClient:
         files: typing.Optional[typing.Dict[str, typing.Optional[typing.Union[File, typing.List[File]]]]] = None,
         headers: typing.Optional[typing.Dict[str, typing.Any]] = None,
         request_options: typing.Optional[RequestOptions] = None,
-        retries: int =
+        retries: int = 2,
         omit: typing.Optional[typing.Any] = None,
     ) -> typing.Iterator[httpx.Response]:
         base_url = self.get_base_url(base_url)
@@ -359,7 +359,7 @@ class AsyncHttpClient:
         files: typing.Optional[typing.Dict[str, typing.Optional[typing.Union[File, typing.List[File]]]]] = None,
         headers: typing.Optional[typing.Dict[str, typing.Any]] = None,
         request_options: typing.Optional[RequestOptions] = None,
-        retries: int =
+        retries: int = 2,
         omit: typing.Optional[typing.Any] = None,
     ) -> httpx.Response:
         base_url = self.get_base_url(base_url)
@@ -445,7 +445,7 @@ class AsyncHttpClient:
         files: typing.Optional[typing.Dict[str, typing.Optional[typing.Union[File, typing.List[File]]]]] = None,
         headers: typing.Optional[typing.Dict[str, typing.Any]] = None,
         request_options: typing.Optional[RequestOptions] = None,
-        retries: int =
+        retries: int = 2,
         omit: typing.Optional[typing.Any] = None,
     ) -> typing.AsyncIterator[httpx.Response]:
         base_url = self.get_base_url(base_url)
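The only functional change in http_client.py is the default for the `retries` parameter, which is now 2 in all four sync and async request/stream methods (the previous default is truncated in this view). As a rough illustration of what a retry budget like this typically means, here is a minimal, self-contained sketch of retrying a request with exponential backoff; the helper name and backoff values are assumptions for illustration, not the generated client's internal code:

```python
import time

import httpx


def get_with_retries(url: str, retries: int = 2, backoff: float = 0.5) -> httpx.Response:
    """Issue a GET, retrying transient failures up to `retries` additional times."""
    for attempt in range(retries + 1):
        try:
            response = httpx.get(url)
            if response.status_code < 500 or attempt == retries:
                return response  # success, non-retryable status, or retry budget exhausted
        except httpx.TransportError:  # connection errors, timeouts, etc.
            if attempt == retries:
                raise
        time.sleep(backoff * (2 ** attempt))  # exponential backoff before the next attempt
    raise RuntimeError("unreachable")  # keeps type checkers happy
```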
groundx/ingest.py
CHANGED
@@ -1,6 +1,8 @@
-import aiohttp, io, json, mimetypes, requests, typing, os
+import aiohttp, io, json, mimetypes, requests, time, typing, os
 from asyncio import TimeoutError
-from
+from pathlib import Path
+from tqdm import tqdm
+from urllib.parse import urlparse, urlunparse
 
 from json.decoder import JSONDecodeError
 
@@ -11,6 +13,7 @@ from .core.request_options import RequestOptions
 from .errors.bad_request_error import BadRequestError
 from .errors.unauthorized_error import UnauthorizedError
 from .types.document import Document
+from .types.document_type import DocumentType
 from .types.ingest_remote_document import IngestRemoteDocument
 from .types.ingest_response import IngestResponse
 
@@ -19,6 +22,14 @@ OMIT = typing.cast(typing.Any, ...)
 
 
 DOCUMENT_TYPE_TO_MIME = {
+    "bmp": "image/bmp",
+    "gif": "image/gif",
+    "heif": "image/heif",
+    "hwp": "application/x-hwp",
+    "ico": "image/vnd.microsoft.icon",
+    "svg": "image/svg",
+    "tiff": "image/tiff",
+    "webp": "image/webp",
     "txt": "text/plain",
     "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
     "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
@@ -32,6 +43,17 @@ DOCUMENT_TYPE_TO_MIME = {
 }
 MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
 
+ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
+
+SUFFIX_ALIASES = {
+    ".jpeg": ".jpg",
+    ".heic": ".heif",
+    ".tif": ".tiff",
+}
+
+MAX_BATCH_SIZE = 50
+MIN_BATCH_SIZE = 1
+MAX_BATCH_SIZE_BYTES = 50 * 1024 * 1024
 
 def prep_documents(
     documents: typing.Sequence[Document],
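These new module-level constants drive the directory ingestion added below: `ALLOWED_SUFFIXES` and `SUFFIX_ALIASES` decide which files are picked up, and the batch limits cap each upload batch at 50 files or 50 MiB. A small sketch of how that suffix filtering could be exercised on its own; the dictionaries are copied (abbreviated) from the diff above, and the `mime_for` helper is hypothetical:

```python
import typing
from pathlib import Path

# Copied from the diff above, abbreviated to a few entries for the sketch.
DOCUMENT_TYPE_TO_MIME = {"tiff": "image/tiff", "txt": "text/plain", "webp": "image/webp"}
ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
SUFFIX_ALIASES = {".jpeg": ".jpg", ".heic": ".heif", ".tif": ".tiff"}


def mime_for(path: Path) -> typing.Optional[str]:
    """Return the MIME type a file would be treated as, or None if it is skipped."""
    suffix = SUFFIX_ALIASES.get(path.suffix.lower(), path.suffix.lower())
    return ALLOWED_SUFFIXES.get(suffix)


print(mime_for(Path("scan.tif")))    # image/tiff (alias .tif -> .tiff)
print(mime_for(Path("notes.txt")))   # text/plain
print(mime_for(Path("movie.mp4")))   # None -- not ingested
```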
@@ -234,6 +256,180 @@ class GroundX(GroundXBase):
 
         raise ApiError(status_code=_response.status_code, body=_response_json)
 
+    def ingest_directory(
+        self,
+        *,
+        bucket_id: int,
+        path: str,
+        batch_size: typing.Optional[int] = 10,
+        upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
+        request_options: typing.Optional[RequestOptions] = None,
+    ):
+        """
+        Ingest documents from a local directory into a GroundX bucket.
+
+        Parameters
+        ----------
+        bucket_id : int
+        path : str
+        batch_size : type.Optional[int]
+
+        # an endpoint that accepts 'name' and 'type' query params
+        # and returns a presigned URL
+        upload_api : typing.Optional[str]
+
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+
+        Returns
+        -------
+        IngestResponse
+            Documents successfully uploaded
+
+        Examples
+        --------
+        from groundx import Document, GroundX
+
+        client = GroundX(
+            api_key="YOUR_API_KEY",
+        )
+
+        client.ingest_directory(
+            bucket_id=0,
+            path="/path/to/directory"
+        )
+        """
+
+        def get_presigned_url(endpoint, file_name, file_extension) -> typing.Dict[str, typing.Any]:
+            params = {"name": file_name, "type": file_extension}
+            response = requests.get(endpoint, params=params)
+            response.raise_for_status()
+
+            return response.json()
+
+        def is_valid_local_directory(path: str) -> bool:
+            expanded_path = os.path.expanduser(path)
+            return os.path.isdir(expanded_path)
+
+        def load_directory_files(directory: str) -> typing.List[Path]:
+            dir_path = Path(directory)
+
+            matched_files = [
+                file
+                for file in dir_path.rglob("*")
+                if file.is_file() and (
+                    file.suffix.lower() in ALLOWED_SUFFIXES
+                    or file.suffix.lower() in SUFFIX_ALIASES
+                )
+            ]
+
+            return matched_files
+
+        def strip_query_params(url: str) -> str:
+            parsed = urlparse(url)
+            clean_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
+            return clean_url
+
+        def _upload_file_batch(bucket_id, batch, upload_api, request_options, pbar):
+            docs = []
+
+            progress = len(batch)
+            for file in batch:
+                url = upload_file(upload_api, file)
+                docs.append(
+                    Document(
+                        bucket_id=bucket_id,
+                        file_path=url,
+                    ),
+                )
+                pbar.update(0.25)
+                progress -= 0.25
+
+            if docs:
+                ingest = self.ingest(documents=docs, request_options=request_options)
+
+                completed_files = set()
+
+                while (
+                    ingest is not None
+                    and ingest.ingest.status not in ["complete", "error", "cancelled"]
+                ):
+                    time.sleep(3)
+                    ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
+
+                    if ingest.ingest.progress and ingest.ingest.progress.processing:
+                        for doc in ingest.ingest.progress.processing.documents:
+                            if doc.status == "complete" and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+
+                if ingest.ingest.status in ["error", "cancelled"]:
+                    raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
+
+            if progress > 0:
+                pbar.update(progress)
+
+        def upload_file(endpoint, file_path) -> str:
+            file_name = os.path.basename(file_path)
+            file_extension = os.path.splitext(file_name)[1][1:].lower()
+
+            presigned_info = get_presigned_url(endpoint, file_name, file_extension)
+
+            upload_url = presigned_info["URL"]
+            headers = presigned_info.get("Header", {})
+            method = presigned_info.get("Method", "PUT").upper()
+
+            for key, value in headers.items():
+                if isinstance(value, list):
+                    headers[key] = value[0]
+
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+
+            if method == "PUT":
+                upload_response = requests.put(upload_url, data=file_data, headers=headers)
+            else:
+                raise ValueError(f"Unsupported HTTP method: {method}")
+
+            if upload_response.status_code not in (200, 201):
+                raise Exception(
+                    f"Upload failed: {upload_response.status_code} - {upload_response.text}"
+                )
+
+            return strip_query_params(upload_url)
+
+        if bucket_id < 1:
+            raise ValueError(f"Invalid bucket_id: {bucket_id}")
+
+        if is_valid_local_directory(path) is not True:
+            raise ValueError(f"Invalid directory path: {path}")
+
+        files = load_directory_files(path)
+
+        if len(files) < 1:
+            raise ValueError(f"No supported files found in directory: {path}")
+
+        current_batch: typing.List[Path] = []
+        current_batch_size: int = 0
+
+        n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))
+
+        with tqdm(total=len(files), desc="Ingesting Files", unit="file") as pbar:
+            for file in files:
+                file_size = file.stat().st_size
+
+                if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (len(current_batch) >= n):
+                    _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+                    current_batch = []
+                    current_batch_size = 0
+
+                current_batch.append(file)
+                current_batch_size += file_size
+
+            if current_batch:
+                _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+
+
 
 class AsyncGroundX(AsyncGroundXBase):
     async def ingest(
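Taken together, the new `ingest_directory` method walks a local directory, uploads every supported file to a presigned-URL endpoint in size- and count-limited batches, and polls processing status while advancing a tqdm progress bar. A usage sketch based on the signature and docstring above; the bucket id, path, and batch size are placeholders:

```python
from groundx import GroundX

client = GroundX(api_key="YOUR_API_KEY")

# Walk ./docs, upload supported files in batches of at most 20 files
# (and at most 50 MiB per batch), then wait for processing to finish.
client.ingest_directory(
    bucket_id=1234,
    path="./docs",
    batch_size=20,
)
```

Per the validation at the top of the method, `bucket_id` must be at least 1 and `path` must be an existing directory; otherwise a `ValueError` is raised, as it is when an ingest ends in the "error" or "cancelled" state.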
{groundx-2.2.3.dist-info → groundx-2.2.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: groundx
-Version: 2.2.
+Version: 2.2.7
 Summary:
 License: MIT
 Requires-Python: >=3.8,<4.0
@@ -25,6 +25,8 @@ Requires-Dist: httpx (>=0.21.2)
 Requires-Dist: pydantic (>=1.9.2)
 Requires-Dist: pydantic-core (>=2.18.2,<3.0.0)
 Requires-Dist: requests (>=2.4.0)
+Requires-Dist: tqdm (>=4.60.0)
+Requires-Dist: types-tqdm (>=4.60.0)
 Requires-Dist: typing_extensions (>=4.0.0)
 Description-Content-Type: text/markdown
 
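The only new runtime dependency is tqdm (types-tqdm is the accompanying type-stub package), which backs the progress bar in `ingest_directory`: each file is credited 0.25 of a bar step when its bytes are uploaded and the remaining 0.75 when processing completes. A minimal sketch of that fractional-update pattern; the file names and sleeps are illustrative only:

```python
import time

from tqdm import tqdm

files = ["a.pdf", "b.docx", "c.tiff"]

with tqdm(total=len(files), desc="Ingesting Files", unit="file") as pbar:
    for name in files:
        time.sleep(0.1)     # stand-in for the upload
        pbar.update(0.25)   # upload finished: a quarter of this file's progress
    for name in files:
        time.sleep(0.1)     # stand-in for the processing poll
        pbar.update(0.75)   # processing finished: the remaining three quarters
```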
{groundx-2.2.3.dist-info → groundx-2.2.7.dist-info}/RECORD
CHANGED
@@ -4,10 +4,10 @@ groundx/buckets/client.py,sha256=4jlc9vfIult1mMJ4FZW4_KFJybZPStZt1FUplIgrxbU,239
 groundx/client.py,sha256=dIW9OyrMyfC1N7HSxRrHh0w_8rJ8osNUOPdYD6ueQ6g,6515
 groundx/core/__init__.py,sha256=SQ85PF84B9MuKnBwHNHWemSGuy-g_515gFYNFhvEE0I,1438
 groundx/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
-groundx/core/client_wrapper.py,sha256=
+groundx/core/client_wrapper.py,sha256=Bhc6L2UfeJoET17u-IIW6OWHD5GwdYaita2HNWDJjr4,1802
 groundx/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
 groundx/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
-groundx/core/http_client.py,sha256=
+groundx/core/http_client.py,sha256=Z77OIxIbL4OAB2IDqjRq_sYa5yNYAWfmdhdCSSvh6Y4,19552
 groundx/core/jsonable_encoder.py,sha256=qaF1gtgH-kQZb4kJskETwcCsOPUof-NnYVdszHkb-dM,3656
 groundx/core/pydantic_utilities.py,sha256=UibVGGYmBDsV834x8CtckRDrTIL4lYJPMrcq9yvf7RM,11973
 groundx/core/query_encoder.py,sha256=ekulqNd0j8TgD7ox-Qbz7liqX8-KP9blvT9DsRCenYM,2144
@@ -26,7 +26,7 @@ groundx/groups/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/groups/client.py,sha256=bytQRh9m7e4vIuYHb7dD1kCTQZvyBxedCqGnmmLqrsI,35237
 groundx/health/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/health/client.py,sha256=fcTa21RWPyBuT77PQ0EncC6rBaW_DrYlRvudy9-0H58,7545
-groundx/ingest.py,sha256=
+groundx/ingest.py,sha256=RTgmeg_4cEaZynSEyf-3ArKGBcnhbcZhJl7BAeUeAMU,18187
 groundx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 groundx/search/__init__.py,sha256=RagVzjShP33mDg9o4N3kGzV0egL1RYNjCpXPE8VzMYE,145
 groundx/search/client.py,sha256=zrrqFy0HowDUYPsMU4nfvDV2RgmkEQ4E8WYNktu3xcs,18684
@@ -81,7 +81,7 @@ groundx/types/subscription_detail.py,sha256=WNfUw2EMVECIvNYcV2s51zZ6T3Utc4zYXw63
 groundx/types/subscription_detail_meters.py,sha256=lBa8-1QlMVHjr5RLGqhiTKnD1KMM0AAHTWvz9TVtG8w,830
 groundx/types/website_source.py,sha256=3WeRCiilNKKBTfhwgjo3jbcVI3vLTeM-KxI6dVzpg9o,1578
 groundx/version.py,sha256=1yVogKaq260fQfckM2RYN2144SEw0QROsZW8ICtkG4U,74
-groundx-2.2.
-groundx-2.2.
-groundx-2.2.
-groundx-2.2.
+groundx-2.2.7.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
+groundx-2.2.7.dist-info/METADATA,sha256=Gx6smhve9G7ECJbxQBrw3pJ_LosNgtehM3VETuW0c9I,5173
+groundx-2.2.7.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+groundx-2.2.7.dist-info/RECORD,,
{groundx-2.2.3.dist-info → groundx-2.2.7.dist-info}/LICENSE
File without changes
{groundx-2.2.3.dist-info → groundx-2.2.7.dist-info}/WHEEL
File without changes