groundx 2.2.4__py3-none-any.whl → 2.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,7 +16,7 @@ class BaseClientWrapper:
16
16
  headers: typing.Dict[str, str] = {
17
17
  "X-Fern-Language": "Python",
18
18
  "X-Fern-SDK-Name": "groundx",
19
- "X-Fern-SDK-Version": "2.2.4",
19
+ "X-Fern-SDK-Version": "2.2.7",
20
20
  }
21
21
  headers["X-API-Key"] = self.api_key
22
22
  return headers
groundx/ingest.py CHANGED
@@ -1,6 +1,8 @@
1
- import aiohttp, io, json, mimetypes, requests, typing, os
1
+ import aiohttp, io, json, mimetypes, requests, time, typing, os
2
2
  from asyncio import TimeoutError
3
- from urllib.parse import urlparse
3
+ from pathlib import Path
4
+ from tqdm import tqdm
5
+ from urllib.parse import urlparse, urlunparse
4
6
 
5
7
  from json.decoder import JSONDecodeError
6
8
 
@@ -11,6 +13,7 @@ from .core.request_options import RequestOptions
11
13
  from .errors.bad_request_error import BadRequestError
12
14
  from .errors.unauthorized_error import UnauthorizedError
13
15
  from .types.document import Document
16
+ from .types.document_type import DocumentType
14
17
  from .types.ingest_remote_document import IngestRemoteDocument
15
18
  from .types.ingest_response import IngestResponse
16
19
 
@@ -19,6 +22,14 @@ OMIT = typing.cast(typing.Any, ...)
19
22
 
20
23
 
21
24
  DOCUMENT_TYPE_TO_MIME = {
25
+ "bmp": "image/bmp",
26
+ "gif": "image/gif",
27
+ "heif": "image/heif",
28
+ "hwp": "application/x-hwp",
29
+ "ico": "image/vnd.microsoft.icon",
30
+ "svg": "image/svg",
31
+ "tiff": "image/tiff",
32
+ "webp": "image/webp",
22
33
  "txt": "text/plain",
23
34
  "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
24
35
  "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
@@ -32,6 +43,17 @@ DOCUMENT_TYPE_TO_MIME = {
32
43
  }
33
44
  MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
34
45
 
46
# Every supported document type keyed by its dotted file suffix (".pdf"
# style), mapped to the MIME type registered in DOCUMENT_TYPE_TO_MIME.
ALLOWED_SUFFIXES = {"." + doc_type: mime for doc_type, mime in DOCUMENT_TYPE_TO_MIME.items()}

# Extra suffixes accepted when scanning a directory; each one maps to the
# suffix it stands in for.
SUFFIX_ALIASES = dict(
    [
        (".jpeg", ".jpg"),
        (".heic", ".heif"),
        (".tif", ".tiff"),
    ]
)

# Bounds applied to the number of files per upload batch, and the cap on the
# combined size (in bytes) of the files placed in a single batch.
MIN_BATCH_SIZE = 1
MAX_BATCH_SIZE = 50
MAX_BATCH_SIZE_BYTES = 50 * 1024 * 1024
35
57
 
36
58
  def prep_documents(
37
59
  documents: typing.Sequence[Document],
@@ -234,6 +256,180 @@ class GroundX(GroundXBase):
234
256
 
235
257
  raise ApiError(status_code=_response.status_code, body=_response_json)
236
258
 
259
def ingest_directory(
    self,
    *,
    bucket_id: int,
    path: str,
    batch_size: typing.Optional[int] = 10,
    upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
    request_options: typing.Optional[RequestOptions] = None,
) -> None:
    """
    Ingest documents from a local directory into a GroundX bucket.

    The directory is scanned recursively for files whose suffix is listed in
    ALLOWED_SUFFIXES or SUFFIX_ALIASES. Files are grouped into batches, each
    file is uploaded through a presigned URL obtained from ``upload_api``,
    and every batch is submitted with ``self.ingest`` and polled until it
    reaches a terminal status. Progress is reported on a tqdm progress bar.

    Parameters
    ----------
    bucket_id : int
        Target bucket; must be >= 1.

    path : str
        Local directory to scan. A leading ``~`` is expanded.

    batch_size : typing.Optional[int]
        Maximum number of files per batch. Clamped to the range
        [MIN_BATCH_SIZE, MAX_BATCH_SIZE]; ``None`` or ``0`` falls back to
        MIN_BATCH_SIZE. A batch is also closed early when adding the next
        file would push its total size past MAX_BATCH_SIZE_BYTES.

    upload_api : typing.Optional[str]
        An endpoint that accepts 'name' and 'type' query params
        and returns a presigned URL.

    request_options : typing.Optional[RequestOptions]
        Request-specific configuration.

    Returns
    -------
    None
        The method blocks until every batch has finished processing.

    Raises
    ------
    ValueError
        If ``bucket_id`` is invalid, ``path`` is not a directory, no
        supported files are found, the presign response asks for an HTTP
        method other than PUT, or a batch ends with status "error" or
        "cancelled".
    requests.HTTPError
        If the presign request returns an error status.
    Exception
        If a file upload responds with a status other than 200 or 201.

    Examples
    --------
    from groundx import GroundX

    client = GroundX(
        api_key="YOUR_API_KEY",
    )

    client.ingest_directory(
        bucket_id=0,
        path="/path/to/directory"
    )
    """

    def get_presigned_url(endpoint, file_name, file_extension) -> typing.Dict[str, typing.Any]:
        # Ask the upload API for a presigned URL for this file name / type.
        response = requests.get(endpoint, params={"name": file_name, "type": file_extension})
        response.raise_for_status()

        return response.json()

    def load_directory_files(directory: str) -> typing.List[Path]:
        # Recursively collect regular files with a supported (or aliased) suffix.
        return [
            file
            for file in Path(directory).rglob("*")
            if file.is_file()
            and (file.suffix.lower() in ALLOWED_SUFFIXES or file.suffix.lower() in SUFFIX_ALIASES)
        ]

    def strip_query_params(url: str) -> str:
        # Keep only scheme, host and path; params, query and fragment are dropped.
        parsed = urlparse(url)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))

    def upload_file(endpoint, file_path) -> str:
        # Upload one file via a presigned URL and return that URL without
        # its query string.
        file_name = os.path.basename(file_path)
        file_extension = os.path.splitext(file_name)[1][1:].lower()

        presigned_info = get_presigned_url(endpoint, file_name, file_extension)

        upload_url = presigned_info["URL"]
        method = presigned_info.get("Method", "PUT").upper()
        if method != "PUT":
            raise ValueError(f"Unsupported HTTP method: {method}")

        # Header values may arrive as lists; only the first entry is sent.
        # `or {}` also covers a response that carries "Header": null.
        headers = {
            key: value[0] if isinstance(value, list) else value
            for key, value in (presigned_info.get("Header") or {}).items()
        }

        with open(file_path, "rb") as f:
            file_data = f.read()

        upload_response = requests.put(upload_url, data=file_data, headers=headers)

        if upload_response.status_code not in (200, 201):
            raise Exception(
                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
            )

        return strip_query_params(upload_url)

    def _upload_file_batch(bucket_id, batch, upload_api, request_options, pbar):
        # Each file contributes 1.0 to the progress bar: 0.25 once uploaded
        # and 0.75 once reported complete. `progress` tracks what is still
        # owed so the bar can be topped up when the batch finishes.
        docs = []
        progress = len(batch)

        for file in batch:
            url = upload_file(upload_api, file)
            docs.append(
                Document(
                    bucket_id=bucket_id,
                    file_path=url,
                ),
            )
            pbar.update(0.25)
            progress -= 0.25

        if docs:
            ingest = self.ingest(documents=docs, request_options=request_options)

            # Document ids already credited on the progress bar; without
            # recording them every poll would count the same document again.
            completed_files = set()

            while (
                ingest is not None
                and ingest.ingest.status not in ["complete", "error", "cancelled"]
            ):
                time.sleep(3)
                ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)

                if ingest.ingest.progress and ingest.ingest.progress.processing:
                    for doc in ingest.ingest.progress.processing.documents or []:
                        if doc.status == "complete" and doc.document_id not in completed_files:
                            completed_files.add(doc.document_id)
                            pbar.update(0.75)
                            progress -= 0.75

            if ingest is not None and ingest.ingest.status in ["error", "cancelled"]:
                raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")

        if progress > 0:
            pbar.update(progress)

    if bucket_id < 1:
        raise ValueError(f"Invalid bucket_id: {bucket_id}")

    # Expand "~" once and use the expanded path for both the validity check
    # and the scan, so "~/docs" is not accepted and then searched literally.
    directory = os.path.expanduser(path)
    if not os.path.isdir(directory):
        raise ValueError(f"Invalid directory path: {path}")

    files = load_directory_files(directory)

    if not files:
        raise ValueError(f"No supported files found in directory: {path}")

    current_batch: typing.List[Path] = []
    current_batch_size: int = 0

    n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))

    with tqdm(total=len(files), desc="Ingesting Files", unit="file") as pbar:
        for file in files:
            file_size = file.stat().st_size

            # Flush before adding a file that would overflow the batch. The
            # `current_batch` guard avoids submitting an empty batch when the
            # very first file is already larger than the byte limit.
            if current_batch and (
                current_batch_size + file_size > MAX_BATCH_SIZE_BYTES
                or len(current_batch) >= n
            ):
                _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
                current_batch = []
                current_batch_size = 0

            current_batch.append(file)
            current_batch_size += file_size

        if current_batch:
            _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
431
+
432
+
237
433
 
238
434
  class AsyncGroundX(AsyncGroundXBase):
239
435
  async def ingest(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: groundx
3
- Version: 2.2.4
3
+ Version: 2.2.7
4
4
  Summary:
5
5
  License: MIT
6
6
  Requires-Python: >=3.8,<4.0
@@ -26,6 +26,7 @@ Requires-Dist: pydantic (>=1.9.2)
26
26
  Requires-Dist: pydantic-core (>=2.18.2,<3.0.0)
27
27
  Requires-Dist: requests (>=2.4.0)
28
28
  Requires-Dist: tqdm (>=4.60.0)
29
+ Requires-Dist: types-tqdm (>=4.60.0)
29
30
  Requires-Dist: typing_extensions (>=4.0.0)
30
31
  Description-Content-Type: text/markdown
31
32
 
@@ -4,7 +4,7 @@ groundx/buckets/client.py,sha256=4jlc9vfIult1mMJ4FZW4_KFJybZPStZt1FUplIgrxbU,239
4
4
  groundx/client.py,sha256=dIW9OyrMyfC1N7HSxRrHh0w_8rJ8osNUOPdYD6ueQ6g,6515
5
5
  groundx/core/__init__.py,sha256=SQ85PF84B9MuKnBwHNHWemSGuy-g_515gFYNFhvEE0I,1438
6
6
  groundx/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
7
- groundx/core/client_wrapper.py,sha256=rpvFZnPH9F7U4KGeUUqy3rGA57uer9q3SdfSnzptNJ0,1802
7
+ groundx/core/client_wrapper.py,sha256=Bhc6L2UfeJoET17u-IIW6OWHD5GwdYaita2HNWDJjr4,1802
8
8
  groundx/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
9
9
  groundx/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
10
10
  groundx/core/http_client.py,sha256=Z77OIxIbL4OAB2IDqjRq_sYa5yNYAWfmdhdCSSvh6Y4,19552
@@ -26,7 +26,7 @@ groundx/groups/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
26
26
  groundx/groups/client.py,sha256=bytQRh9m7e4vIuYHb7dD1kCTQZvyBxedCqGnmmLqrsI,35237
27
27
  groundx/health/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
28
28
  groundx/health/client.py,sha256=fcTa21RWPyBuT77PQ0EncC6rBaW_DrYlRvudy9-0H58,7545
29
- groundx/ingest.py,sha256=snQ586PRmV_s3VQNBqYfKM0Lo_AaRvft5mX4sT4k-l0,11536
29
+ groundx/ingest.py,sha256=RTgmeg_4cEaZynSEyf-3ArKGBcnhbcZhJl7BAeUeAMU,18187
30
30
  groundx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
31
  groundx/search/__init__.py,sha256=RagVzjShP33mDg9o4N3kGzV0egL1RYNjCpXPE8VzMYE,145
32
32
  groundx/search/client.py,sha256=zrrqFy0HowDUYPsMU4nfvDV2RgmkEQ4E8WYNktu3xcs,18684
@@ -81,7 +81,7 @@ groundx/types/subscription_detail.py,sha256=WNfUw2EMVECIvNYcV2s51zZ6T3Utc4zYXw63
81
81
  groundx/types/subscription_detail_meters.py,sha256=lBa8-1QlMVHjr5RLGqhiTKnD1KMM0AAHTWvz9TVtG8w,830
82
82
  groundx/types/website_source.py,sha256=3WeRCiilNKKBTfhwgjo3jbcVI3vLTeM-KxI6dVzpg9o,1578
83
83
  groundx/version.py,sha256=1yVogKaq260fQfckM2RYN2144SEw0QROsZW8ICtkG4U,74
84
- groundx-2.2.4.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
85
- groundx-2.2.4.dist-info/METADATA,sha256=hIIy9ZCSg8stpVhmHV-LFw44ky_Gtd7lSvOY6FLJzCc,5136
86
- groundx-2.2.4.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
87
- groundx-2.2.4.dist-info/RECORD,,
84
+ groundx-2.2.7.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
85
+ groundx-2.2.7.dist-info/METADATA,sha256=Gx6smhve9G7ECJbxQBrw3pJ_LosNgtehM3VETuW0c9I,5173
86
+ groundx-2.2.7.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
87
+ groundx-2.2.7.dist-info/RECORD,,