groundx 2.2.9__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,7 +16,7 @@ class BaseClientWrapper:
16
16
  headers: typing.Dict[str, str] = {
17
17
  "X-Fern-Language": "Python",
18
18
  "X-Fern-SDK-Name": "groundx",
19
- "X-Fern-SDK-Version": "2.2.9",
19
+ "X-Fern-SDK-Version": "2.3.0",
20
20
  }
21
21
  headers["X-API-Key"] = self.api_key
22
22
  return headers
groundx/ingest.py CHANGED
@@ -9,6 +9,7 @@ from .csv_splitter import CSVSplitter
9
9
  from .types.document import Document
10
10
  from .types.ingest_remote_document import IngestRemoteDocument
11
11
  from .types.ingest_response import IngestResponse
12
+ from .types.ingest_response_ingest import IngestResponseIngest
12
13
 
13
14
  # this is used as the default value for optional parameters
14
15
  OMIT = typing.cast(typing.Any, ...)
@@ -140,6 +141,8 @@ class GroundX(GroundXBase):
140
141
  self,
141
142
  *,
142
143
  documents: typing.Sequence[Document],
144
+ batch_size: typing.Optional[int] = 10,
145
+ wait_for_complete: typing.Optional[bool] = False,
143
146
  upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
144
147
  request_options: typing.Optional[RequestOptions] = None,
145
148
  ) -> IngestResponse:
@@ -150,6 +153,13 @@ class GroundX(GroundXBase):
150
153
  ----------
151
154
  documents : typing.Sequence[Document]
152
155
 
156
+ # defines how many files to send per batch
157
+ # ignored unless wait_for_complete is True
158
+ batch_size : typing.Optional[int]
159
+
160
+ # will turn on progress bar and wait for ingestion to complete
161
+ wait_for_complete : typing.Optional[bool]
162
+
153
163
  # an endpoint that accepts 'name' and 'type' query params
154
164
  # and returns a presigned URL in a JSON dictionary with key 'URL'
155
165
  upload_api : typing.Optional[str]
@@ -183,36 +193,84 @@ class GroundX(GroundXBase):
183
193
  """
184
194
  remote_documents, local_documents = prep_documents(documents)
185
195
 
186
- if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
187
- raise ValueError("You have sent too many documents in this request")
188
-
189
196
  if len(remote_documents) + len(local_documents) == 0:
190
197
  raise ValueError("No valid documents were provided")
191
198
 
192
- for d in local_documents:
193
- splits = split_doc(Path(os.path.expanduser(d.file_path)))
199
+ if wait_for_complete:
200
+ with tqdm(total=len(remote_documents) + len(local_documents), desc="Ingesting Files", unit="file") as pbar:
201
+ n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))
194
202
 
195
- for sd in splits:
196
- url = self._upload_file(upload_api, sd)
203
+ remote_batch: typing.List[IngestRemoteDocument] = []
204
+ ingest = IngestResponse(ingest=IngestResponseIngest(process_id="",status="queued"))
197
205
 
198
- ft = d.file_type
199
- if sd.suffix.lower() in SUFFIX_ALIASES:
200
- ft = SUFFIX_ALIASES[sd.suffix.lower()]
206
+ progress = float(len(remote_documents))
207
+ for rd in remote_documents:
208
+ if len(remote_batch) >= n:
209
+ ingest = self.documents.ingest_remote(
210
+ documents=remote_batch,
211
+ request_options=request_options,
212
+ )
213
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
201
214
 
202
- fn = sd.name
203
- if len(splits) == 1 and d.file_name:
204
- fn = d.file_name
215
+ remote_batch = []
205
216
 
206
- remote_documents.append(
207
- IngestRemoteDocument(
208
- bucket_id=d.bucket_id,
209
- file_name=fn,
210
- file_type=ft,
211
- process_level=d.process_level,
212
- search_data=d.search_data,
213
- source_url=url,
217
+ remote_batch.append(rd)
218
+ pbar.update(0.25)
219
+ progress -= 0.25
220
+
221
+ if remote_batch:
222
+ ingest = self.documents.ingest_remote(
223
+ documents=remote_batch,
224
+ request_options=request_options,
214
225
  )
215
- )
226
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
227
+
228
+
229
+ if progress > 0:
230
+ pbar.update(progress)
231
+
232
+ current_batch_size = 0
233
+ local_batch: typing.List[Document] = []
234
+
235
+ progress = float(len(local_documents))
236
+ for ld in local_documents:
237
+ fp = Path(os.path.expanduser(ld.file_path))
238
+ file_size = fp.stat().st_size
239
+
240
+ if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (len(local_batch) >= n):
241
+ up_docs, progress = self._process_local(local_batch, upload_api, progress, pbar)
242
+
243
+ ingest = self.documents.ingest_remote(
244
+ documents=up_docs,
245
+ request_options=request_options,
246
+ )
247
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
248
+
249
+ local_batch = []
250
+ current_batch_size = 0
251
+
252
+ local_batch.append(ld)
253
+ current_batch_size += file_size
254
+
255
+ if local_batch:
256
+ up_docs, progress = self._process_local(local_batch, upload_api, progress, pbar)
257
+
258
+ ingest = self.documents.ingest_remote(
259
+ documents=up_docs,
260
+ request_options=request_options,
261
+ )
262
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
263
+
264
+ if progress > 0:
265
+ pbar.update(progress)
266
+
267
+ return ingest
268
+ elif len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
269
+ raise ValueError("You have sent too many documents in this request")
270
+
271
+
272
+ up_docs, _ = self._process_local(local_documents, upload_api)
273
+ remote_documents.extend(up_docs)
216
274
 
217
275
  return self.documents.ingest_remote(
218
276
  documents=remote_documents,
@@ -346,6 +404,92 @@ class GroundX(GroundXBase):
346
404
 
347
405
  return strip_query_params(upload_url)
348
406
 
407
+ def _process_local(
408
+ self,
409
+ local_docs,
410
+ upload_api,
411
+ progress = None,
412
+ pbar = None,
413
+ ):
414
+ remote_docs = []
415
+ for d in local_docs:
416
+ splits = split_doc(Path(os.path.expanduser(d.file_path)))
417
+
418
+ for sd in splits:
419
+ url = self._upload_file(upload_api, sd)
420
+
421
+ ft = d.file_type
422
+ if sd.suffix.lower() in SUFFIX_ALIASES:
423
+ ft = SUFFIX_ALIASES[sd.suffix.lower()]
424
+
425
+ fn = sd.name
426
+ if len(splits) == 1 and d.file_name:
427
+ fn = d.file_name
428
+
429
+ remote_docs.append(
430
+ IngestRemoteDocument(
431
+ bucket_id=d.bucket_id,
432
+ file_name=fn,
433
+ file_type=ft,
434
+ process_level=d.process_level,
435
+ search_data=d.search_data,
436
+ source_url=url,
437
+ )
438
+ )
439
+
440
+ if progress is not None and pbar is not None and pbar.update is not None:
441
+ pbar.update(0.25)
442
+ progress -= 0.25
443
+
444
+ return remote_docs, progress
445
+
446
+ def _monitor_batch(
447
+ self,
448
+ ingest,
449
+ progress,
450
+ pbar,
451
+ ):
452
+ completed_files = set()
453
+
454
+ while (
455
+ ingest is not None
456
+ and ingest.ingest.status not in ["complete", "error", "cancelled"]
457
+ ):
458
+ time.sleep(3)
459
+ ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
460
+
461
+ if ingest.ingest.progress:
462
+ if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
463
+ for doc in ingest.ingest.progress.processing.documents:
464
+ if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
465
+ pbar.update(0.75)
466
+ progress -= 0.75
467
+ completed_files.add(doc.document_id)
468
+ if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
469
+ for doc in ingest.ingest.progress.complete.documents:
470
+ if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
471
+ pbar.update(0.75)
472
+ progress -= 0.75
473
+ completed_files.add(doc.document_id)
474
+ if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
475
+ for doc in ingest.ingest.progress.cancelled.documents:
476
+ if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
477
+ pbar.update(0.75)
478
+ progress -= 0.75
479
+ completed_files.add(doc.document_id)
480
+ if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
481
+ for doc in ingest.ingest.progress.errors.documents:
482
+ if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
483
+ pbar.update(0.75)
484
+ progress -= 0.75
485
+ completed_files.add(doc.document_id)
486
+
487
+
488
+ if ingest.ingest.status in ["error", "cancelled"]:
489
+ raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
490
+
491
+ return ingest, progress
492
+
349
493
  def _upload_file_batch(
350
494
  self,
351
495
  bucket_id,
@@ -356,7 +500,7 @@ class GroundX(GroundXBase):
356
500
  ):
357
501
  docs = []
358
502
 
359
- progress = len(batch)
503
+ progress = float(len(batch))
360
504
  for file in batch:
361
505
  url = self._upload_file(upload_api, file)
362
506
  if file.suffix.lower() in SUFFIX_ALIASES:
@@ -381,44 +525,10 @@ class GroundX(GroundXBase):
381
525
 
382
526
  if docs:
383
527
  ingest = self.ingest(documents=docs, request_options=request_options)
528
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
384
529
 
385
- completed_files = set()
386
-
387
- while (
388
- ingest is not None
389
- and ingest.ingest.status not in ["complete", "error", "cancelled"]
390
- ):
391
- time.sleep(3)
392
- ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
393
-
394
- if ingest.ingest.progress:
395
- if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
396
- for doc in ingest.ingest.progress.processing.documents:
397
- if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
398
- pbar.update(0.75)
399
- progress -= 0.75
400
- if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
401
- for doc in ingest.ingest.progress.complete.documents:
402
- if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
403
- pbar.update(0.75)
404
- progress -= 0.75
405
- if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
406
- for doc in ingest.ingest.progress.cancelled.documents:
407
- if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
408
- pbar.update(0.75)
409
- progress -= 0.75
410
- if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
411
- for doc in ingest.ingest.progress.errors.documents:
412
- if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
413
- pbar.update(0.75)
414
- progress -= 0.75
415
-
416
-
417
- if ingest.ingest.status in ["error", "cancelled"]:
418
- raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
419
-
420
- if progress > 0:
421
- pbar.update(progress)
530
+ if progress > 0:
531
+ pbar.update(progress)
422
532
 
423
533
 
424
534
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: groundx
3
- Version: 2.2.9
3
+ Version: 2.3.0
4
4
  Summary:
5
5
  License: MIT
6
6
  Requires-Python: >=3.8,<4.0
@@ -4,7 +4,7 @@ groundx/buckets/client.py,sha256=4jlc9vfIult1mMJ4FZW4_KFJybZPStZt1FUplIgrxbU,239
4
4
  groundx/client.py,sha256=dIW9OyrMyfC1N7HSxRrHh0w_8rJ8osNUOPdYD6ueQ6g,6515
5
5
  groundx/core/__init__.py,sha256=SQ85PF84B9MuKnBwHNHWemSGuy-g_515gFYNFhvEE0I,1438
6
6
  groundx/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
7
- groundx/core/client_wrapper.py,sha256=D6uZpUYxYzmgxNNCTN7quiFvNlBQmPOyLstnrXfcJcs,1802
7
+ groundx/core/client_wrapper.py,sha256=8jK6xYLcQGKejOGCZNZNBDgRUuZdWlW5WCXSA5K1yIo,1802
8
8
  groundx/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
9
9
  groundx/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
10
10
  groundx/core/http_client.py,sha256=Z77OIxIbL4OAB2IDqjRq_sYa5yNYAWfmdhdCSSvh6Y4,19552
@@ -27,7 +27,7 @@ groundx/groups/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
27
27
  groundx/groups/client.py,sha256=bytQRh9m7e4vIuYHb7dD1kCTQZvyBxedCqGnmmLqrsI,35237
28
28
  groundx/health/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
29
29
  groundx/health/client.py,sha256=fcTa21RWPyBuT77PQ0EncC6rBaW_DrYlRvudy9-0H58,7545
30
- groundx/ingest.py,sha256=LtnUGcgtE1MNYL3PGFrzPqRMnLeOxr-fVsZ3fmTAUKI,18294
30
+ groundx/ingest.py,sha256=1vp0-E-Von_kRtTpgykSVxQOqPWzWod4Og3G4l0RJFg,22424
31
31
  groundx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
32
  groundx/search/__init__.py,sha256=RagVzjShP33mDg9o4N3kGzV0egL1RYNjCpXPE8VzMYE,145
33
33
  groundx/search/client.py,sha256=zrrqFy0HowDUYPsMU4nfvDV2RgmkEQ4E8WYNktu3xcs,18684
@@ -82,7 +82,7 @@ groundx/types/subscription_detail.py,sha256=WNfUw2EMVECIvNYcV2s51zZ6T3Utc4zYXw63
82
82
  groundx/types/subscription_detail_meters.py,sha256=lBa8-1QlMVHjr5RLGqhiTKnD1KMM0AAHTWvz9TVtG8w,830
83
83
  groundx/types/website_source.py,sha256=3WeRCiilNKKBTfhwgjo3jbcVI3vLTeM-KxI6dVzpg9o,1578
84
84
  groundx/version.py,sha256=1yVogKaq260fQfckM2RYN2144SEw0QROsZW8ICtkG4U,74
85
- groundx-2.2.9.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
86
- groundx-2.2.9.dist-info/METADATA,sha256=1BWmC2-Lx8AT1vEY0juSK4ugjxyn9W5ndJy0bomFEwQ,5173
87
- groundx-2.2.9.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
88
- groundx-2.2.9.dist-info/RECORD,,
85
+ groundx-2.3.0.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
86
+ groundx-2.3.0.dist-info/METADATA,sha256=RKrqM0mYmmZSPBj7n8AH-3BGYZyAuSiTmycUbs3yCJI,5173
87
+ groundx-2.3.0.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
88
+ groundx-2.3.0.dist-info/RECORD,,