groundx 2.0.15__py3-none-any.whl → 2.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. groundx/__init__.py +73 -21
  2. groundx/buckets/__init__.py +2 -0
  3. groundx/buckets/client.py +55 -388
  4. groundx/buckets/raw_client.py +628 -0
  5. groundx/client.py +22 -21
  6. groundx/core/__init__.py +5 -0
  7. groundx/core/api_error.py +13 -5
  8. groundx/core/client_wrapper.py +4 -3
  9. groundx/core/force_multipart.py +16 -0
  10. groundx/core/http_client.py +76 -32
  11. groundx/core/http_response.py +55 -0
  12. groundx/core/jsonable_encoder.py +0 -1
  13. groundx/core/pydantic_utilities.py +71 -112
  14. groundx/core/serialization.py +7 -3
  15. groundx/csv_splitter.py +64 -0
  16. groundx/customer/__init__.py +2 -0
  17. groundx/customer/client.py +31 -43
  18. groundx/customer/raw_client.py +91 -0
  19. groundx/documents/__init__.py +1 -2
  20. groundx/documents/client.py +455 -953
  21. groundx/documents/raw_client.py +1450 -0
  22. groundx/errors/__init__.py +2 -0
  23. groundx/errors/bad_request_error.py +4 -3
  24. groundx/errors/unauthorized_error.py +4 -3
  25. groundx/extract/__init__.py +48 -0
  26. groundx/extract/agents/__init__.py +7 -0
  27. groundx/extract/agents/agent.py +202 -0
  28. groundx/extract/classes/__init__.py +24 -0
  29. groundx/extract/classes/agent.py +23 -0
  30. groundx/extract/classes/api.py +15 -0
  31. groundx/extract/classes/document.py +338 -0
  32. groundx/extract/classes/field.py +88 -0
  33. groundx/extract/classes/groundx.py +147 -0
  34. groundx/extract/classes/prompt.py +36 -0
  35. groundx/extract/classes/test_document.py +109 -0
  36. groundx/extract/classes/test_field.py +43 -0
  37. groundx/extract/classes/test_groundx.py +223 -0
  38. groundx/extract/classes/test_prompt.py +68 -0
  39. groundx/extract/post_process/__init__.py +7 -0
  40. groundx/extract/post_process/post_process.py +33 -0
  41. groundx/extract/services/.DS_Store +0 -0
  42. groundx/extract/services/__init__.py +14 -0
  43. groundx/extract/services/csv.py +76 -0
  44. groundx/extract/services/logger.py +126 -0
  45. groundx/extract/services/logging_cfg.py +53 -0
  46. groundx/extract/services/ratelimit.py +104 -0
  47. groundx/extract/services/sheets_client.py +160 -0
  48. groundx/extract/services/status.py +197 -0
  49. groundx/extract/services/upload.py +68 -0
  50. groundx/extract/services/upload_minio.py +122 -0
  51. groundx/extract/services/upload_s3.py +91 -0
  52. groundx/extract/services/utility.py +52 -0
  53. groundx/extract/settings/__init__.py +15 -0
  54. groundx/extract/settings/settings.py +212 -0
  55. groundx/extract/settings/test_settings.py +512 -0
  56. groundx/extract/tasks/__init__.py +6 -0
  57. groundx/extract/tasks/utility.py +27 -0
  58. groundx/extract/utility/__init__.py +15 -0
  59. groundx/extract/utility/classes.py +193 -0
  60. groundx/extract/utility/test_utility.py +81 -0
  61. groundx/groups/__init__.py +2 -0
  62. groundx/groups/client.py +63 -550
  63. groundx/groups/raw_client.py +901 -0
  64. groundx/health/__init__.py +2 -0
  65. groundx/health/client.py +35 -101
  66. groundx/health/raw_client.py +193 -0
  67. groundx/ingest.py +771 -0
  68. groundx/search/__init__.py +2 -0
  69. groundx/search/client.py +94 -227
  70. groundx/search/raw_client.py +442 -0
  71. groundx/search/types/__init__.py +2 -0
  72. groundx/types/__init__.py +68 -16
  73. groundx/types/bounding_box_detail.py +4 -4
  74. groundx/types/bucket_detail.py +5 -5
  75. groundx/types/bucket_list_response.py +17 -3
  76. groundx/types/bucket_response.py +3 -3
  77. groundx/types/bucket_update_detail.py +4 -4
  78. groundx/types/bucket_update_response.py +3 -3
  79. groundx/types/customer_detail.py +2 -2
  80. groundx/types/customer_response.py +3 -3
  81. groundx/types/document.py +54 -0
  82. groundx/types/document_detail.py +16 -4
  83. groundx/types/document_list_response.py +4 -4
  84. groundx/types/document_local_ingest_request.py +7 -0
  85. groundx/types/document_lookup_response.py +8 -3
  86. groundx/types/document_response.py +3 -3
  87. groundx/types/document_type.py +21 -1
  88. groundx/types/group_detail.py +4 -4
  89. groundx/types/group_list_response.py +17 -3
  90. groundx/types/group_response.py +3 -3
  91. groundx/types/health_response.py +3 -3
  92. groundx/types/health_response_health.py +3 -3
  93. groundx/types/health_service.py +5 -5
  94. groundx/types/ingest_local_document.py +25 -0
  95. groundx/types/ingest_local_document_metadata.py +51 -0
  96. groundx/types/ingest_remote_document.py +15 -6
  97. groundx/types/ingest_response.py +4 -4
  98. groundx/types/{process_status_response_ingest.py → ingest_status.py} +8 -7
  99. groundx/types/{ingest_response_ingest.py → ingest_status_light.py} +7 -5
  100. groundx/types/ingest_status_progress.py +26 -0
  101. groundx/types/{process_status_response_ingest_progress_errors.py → ingest_status_progress_cancelled.py} +4 -4
  102. groundx/types/{process_status_response_ingest_progress_complete.py → ingest_status_progress_complete.py} +4 -4
  103. groundx/types/{process_status_response_ingest_progress_cancelled.py → ingest_status_progress_errors.py} +4 -4
  104. groundx/types/{process_status_response_ingest_progress_processing.py → ingest_status_progress_processing.py} +4 -4
  105. groundx/types/message_response.py +2 -2
  106. groundx/types/meter_detail.py +2 -2
  107. groundx/types/process_level.py +5 -0
  108. groundx/types/{process_status_response.py → processes_status_response.py} +8 -5
  109. groundx/types/processing_status.py +3 -1
  110. groundx/types/search_response.py +3 -3
  111. groundx/types/search_response_search.py +3 -3
  112. groundx/types/search_result_item.py +7 -5
  113. groundx/types/search_result_item_pages_item.py +41 -0
  114. groundx/types/subscription_detail.py +3 -3
  115. groundx/types/subscription_detail_meters.py +5 -5
  116. groundx/{documents/types/website_crawl_request_websites_item.py → types/website_source.py} +7 -7
  117. groundx/types/workflow_apply_request.py +24 -0
  118. groundx/types/workflow_detail.py +59 -0
  119. groundx/types/workflow_detail_chunk_strategy.py +5 -0
  120. groundx/types/workflow_detail_relationships.py +36 -0
  121. groundx/types/workflow_engine.py +58 -0
  122. groundx/types/workflow_engine_reasoning_effort.py +5 -0
  123. groundx/types/workflow_engine_service.py +7 -0
  124. groundx/types/workflow_prompt.py +37 -0
  125. groundx/types/workflow_prompt_group.py +25 -0
  126. groundx/types/workflow_prompt_role.py +5 -0
  127. groundx/types/workflow_request.py +31 -0
  128. groundx/types/workflow_request_chunk_strategy.py +5 -0
  129. groundx/types/workflow_response.py +20 -0
  130. groundx/types/workflow_step.py +33 -0
  131. groundx/types/workflow_step_config.py +33 -0
  132. groundx/types/workflow_step_config_field.py +8 -0
  133. groundx/types/workflow_steps.py +38 -0
  134. groundx/types/workflows_response.py +20 -0
  135. groundx/workflows/__init__.py +7 -0
  136. groundx/workflows/client.py +736 -0
  137. groundx/workflows/raw_client.py +841 -0
  138. groundx/workflows/types/__init__.py +7 -0
  139. groundx/workflows/types/workflows_get_request_id.py +5 -0
  140. {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/LICENSE +1 -1
  141. {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/METADATA +39 -22
  142. groundx-2.7.7.dist-info/RECORD +155 -0
  143. groundx/documents/types/__init__.py +0 -6
  144. groundx/documents/types/documents_ingest_local_request_files_item.py +0 -43
  145. groundx/types/process_status_response_ingest_progress.py +0 -26
  146. groundx-2.0.15.dist-info/RECORD +0 -82
  147. {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/WHEEL +0 -0
groundx/ingest.py ADDED
@@ -0,0 +1,771 @@
+ import requests, time, typing, os
+ from pathlib import Path
+ from tqdm import tqdm
+ from urllib.parse import urlparse, urlunparse
+
+ from .client import GroundXBase, AsyncGroundXBase
+ from .core.request_options import RequestOptions
+ from .csv_splitter import CSVSplitter
+ from .types.document import Document
+ from .types.ingest_remote_document import IngestRemoteDocument
+ from .types.ingest_response import IngestResponse
+ from .types.ingest_status import IngestStatus
+
+ # this is used as the default value for optional parameters
+ OMIT = typing.cast(typing.Any, ...)
+
+
+ DOCUMENT_TYPE_TO_MIME = {
+     "bmp": "image/bmp",
+     "gif": "image/gif",
+     "heif": "image/heif",
+     "hwp": "application/x-hwp",
+     "ico": "image/vnd.microsoft.icon",
+     "svg": "image/svg",
+     "tiff": "image/tiff",
+     "webp": "image/webp",
+     "txt": "text/plain",
+     "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+     "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+     "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+     "pdf": "application/pdf",
+     "png": "image/png",
+     "jpg": "image/jpeg",
+     "csv": "text/csv",
+     "tsv": "text/tab-separated-values",
+     "json": "application/json",
+ }
+ MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
+
+ ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
+
+ CSV_SPLITS = {
+     ".csv": True,
+ }
+ TSV_SPLITS = {
+     ".tsv": True,
+ }
+
+ SUFFIX_ALIASES = {
+     ".jpeg": "jpg",
+     ".heic": "heif",
+     ".tif": "tiff",
+     ".md": "txt",
+ }
+
+ MAX_BATCH_SIZE = 50
+ MIN_BATCH_SIZE = 1
+ MAX_BATCH_SIZE_BYTES = 50 * 1024 * 1024
+
+
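These tables drive the rest of the module: a file's suffix selects its document type and MIME type, SUFFIX_ALIASES folds equivalent extensions together, and the batch constants cap how much is sent per request. A minimal sketch of how a path resolves against them (the resolve_type helper is illustrative, not part of the package):

    from pathlib import Path

    def resolve_type(path: str) -> str:
        suffix = Path(path).suffix.lower()
        if suffix in SUFFIX_ALIASES:      # ".jpeg" -> "jpg", ".md" -> "txt"
            return SUFFIX_ALIASES[suffix]
        if suffix in ALLOWED_SUFFIXES:    # known suffix: the type is the suffix minus the dot
            return suffix[1:]
        raise ValueError(f"unsupported suffix: {suffix}")

    assert resolve_type("report.PDF") == "pdf"
    assert resolve_type("photo.jpeg") == "jpg"
    assert DOCUMENT_TYPE_TO_MIME[resolve_type("notes.md")] == "text/plain"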
+ def get_presigned_url(
+     endpoint: str,
+     file_name: str,
+     file_extension: str,
+ ) -> typing.Dict[str, typing.Any]:
+     params = {"name": file_name, "type": file_extension}
+     response = requests.get(endpoint, params=params)
+     response.raise_for_status()
+
+     return response.json()
+
+
+ def strip_query_params(
+     url: str,
+ ) -> str:
+     parsed = urlparse(url)
+     clean_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
+
+     return clean_url
+
+
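strip_query_params drops the signing parameters from a presigned URL so that only the durable object URL is registered with the ingest API. For example (bucket URL is illustrative):

    url = (
        "https://bucket.s3.amazonaws.com/docs/file.pdf"
        "?X-Amz-Signature=abc123&X-Amz-Expires=900"
    )
    assert strip_query_params(url) == "https://bucket.s3.amazonaws.com/docs/file.pdf"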
+ def prep_documents(
+     documents: typing.Sequence[Document],
+ ) -> typing.Tuple[
+     typing.List[IngestRemoteDocument],
+     typing.List[Document],
+ ]:
+     """
+     Process documents and separate them into remote and local documents.
+     """
+     if not documents:
+         raise ValueError("No documents provided for ingestion.")
+
+     def is_valid_local_path(path: str) -> bool:
+         expanded_path = os.path.expanduser(path)
+         return os.path.exists(expanded_path)
+
+     def is_valid_url(path: str) -> bool:
+         try:
+             result = urlparse(path)
+             return all([result.scheme, result.netloc])
+         except ValueError:
+             return False
+
+     local_documents: typing.List[Document] = []
+     remote_documents: typing.List[IngestRemoteDocument] = []
+
+     for document in documents:
+         if not hasattr(document, "file_path"):
+             raise ValueError("Each document must have a 'file_path' attribute.")
+
+         if is_valid_url(document.file_path):
+             remote_document = IngestRemoteDocument(
+                 bucket_id=document.bucket_id,
+                 file_name=document.file_name,
+                 file_type=document.file_type,
+                 filter=document.filter,
+                 process_level=document.process_level,
+                 search_data=document.search_data,
+                 source_url=document.file_path,
+             )
+             remote_documents.append(remote_document)
+         elif is_valid_local_path(document.file_path):
+             local_documents.append(document)
+         else:
+             raise ValueError(f"Invalid file path: {document.file_path}")
+
+     return remote_documents, local_documents
+
+
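prep_documents classifies each Document by its file_path: anything that parses as a URL with a scheme and host becomes an IngestRemoteDocument, an existing local path stays local, and anything else raises. A sketch (IDs and paths are illustrative, and the local file must exist):

    remote, local = prep_documents([
        Document(bucket_id=1234, file_path="https://example.com/file1.pdf"),
        Document(bucket_id=1234, file_path="~/reports/q3.xlsx"),
    ])
    # remote -> [IngestRemoteDocument(source_url="https://example.com/file1.pdf", ...)]
    # local  -> [Document(file_path="~/reports/q3.xlsx", ...)]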
+ def split_doc(file: Path) -> typing.List[Path]:
+     if file.is_file() and (
+         file.suffix.lower() in ALLOWED_SUFFIXES or file.suffix.lower() in SUFFIX_ALIASES
+     ):
+         if file.suffix.lower() in CSV_SPLITS:
+             return CSVSplitter(filepath=file).split()
+         elif file.suffix.lower() in TSV_SPLITS:
+             return CSVSplitter(filepath=file, delimiter="\t").split()
+         return [file]
+     return []
+
+
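split_doc fans CSV and TSV files out into multiple part files via CSVSplitter before upload; any other supported file passes through as a single-element list, and unsupported suffixes are silently skipped. In sketch form (assuming the paths exist):

    parts = split_doc(Path("~/data/huge.csv").expanduser())       # possibly several chunk files
    same = split_doc(Path("~/docs/report.pdf").expanduser())      # [Path(".../report.pdf")]
    skipped = split_doc(Path("~/docs/archive.zip").expanduser())  # [] - unsupported suffix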
+ class GroundX(GroundXBase):
+     def ingest(
+         self,
+         *,
+         documents: typing.Sequence[Document],
+         batch_size: int = 10,
+         wait_for_complete: bool = False,
+         upload_api: str = "https://api.eyelevel.ai/upload/file",
+         callback_url: typing.Optional[str] = None,
+         callback_data: typing.Optional[str] = None,
+         request_options: typing.Optional[RequestOptions] = None,
+     ) -> IngestResponse:
+         """
+         Ingest local or hosted documents into a GroundX bucket.
+
+         Parameters
+         ----------
+         documents : typing.Sequence[Document]
+
+         # defines how many files to send per batch
+         # ignored unless wait_for_complete is True
+         batch_size : typing.Optional[int]
+
+         # will turn on progress bar and wait for ingestion to complete
+         wait_for_complete : typing.Optional[bool]
+
+         # an endpoint that accepts 'name' and 'type' query params
+         # and returns a presigned URL in a JSON dictionary with key 'URL'
+         upload_api : typing.Optional[str]
+
+         # an endpoint that will receive processing event updates as POST
+         callback_url : typing.Optional[str]
+
+         # a string that is returned, along with processing event updates,
+         # to the callback URL.
+         callback_data : typing.Optional[str]
+
+         request_options : typing.Optional[RequestOptions]
+             Request-specific configuration.
+
+         Returns
+         -------
+         IngestResponse
+             Documents successfully uploaded
+
+         Examples
+         --------
+         from groundx import Document, GroundX
+
+         client = GroundX(
+             api_key="YOUR_API_KEY",
+         )
+
+         client.ingest(
+             documents=[
+                 Document(
+                     bucket_id=1234,
+                     file_name="my_file1.txt",
+                     file_path="https://my.source.url.com/file1.txt",
+                     file_type="txt",
+                 )
+             ],
+         )
+         """
+         remote_documents, local_documents = prep_documents(documents)
+
+         if len(remote_documents) + len(local_documents) == 0:
+             raise ValueError("No valid documents were provided")
+
+         if wait_for_complete:
+             with tqdm(
+                 total=len(remote_documents) + len(local_documents),
+                 desc="Ingesting Files",
+                 unit="file",
+             ) as pbar:
+                 n = max(
+                     MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE)
+                 )
+
+                 remote_batch: typing.List[IngestRemoteDocument] = []
+                 ingest = IngestResponse(
+                     ingest=IngestStatus(process_id="", status="queued")
+                 )
+
+                 progress = float(len(remote_documents))
+                 for rd in remote_documents:
+                     if len(remote_batch) >= n:
+                         ingest = self.documents.ingest_remote(
+                             documents=remote_batch,
+                             callback_url=callback_url,
+                             callback_data=callback_data,
+                             request_options=request_options,
+                         )
+                         ingest, progress = self._monitor_batch(ingest, progress, pbar)
+
+                         remote_batch = []
+
+                     remote_batch.append(rd)
+                     pbar.update(0.25)
+                     progress -= 0.25
+
+                 if remote_batch:
+                     ingest = self.documents.ingest_remote(
+                         documents=remote_batch,
+                         callback_data=callback_data,
+                         callback_url=callback_url,
+                         request_options=request_options,
+                     )
+                     ingest, progress = self._monitor_batch(ingest, progress, pbar)
+
+                 if progress > 0:
+                     pbar.update(progress)
+
+                 current_batch_size = 0
+                 local_batch: typing.List[Document] = []
+
+                 progress = float(len(local_documents))
+                 for ld in local_documents:
+                     fp = Path(os.path.expanduser(ld.file_path))
+                     file_size = fp.stat().st_size
+
+                     if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (
+                         len(local_batch) >= n
+                     ):
+                         up_docs, progress = self._process_local(
+                             local_batch, upload_api, progress, pbar
+                         )
+
+                         ingest = self.documents.ingest_remote(
+                             documents=up_docs,
+                             callback_url=callback_url,
+                             callback_data=callback_data,
+                             request_options=request_options,
+                         )
+                         ingest, progress = self._monitor_batch(ingest, progress, pbar)
+
+                         local_batch = []
+                         current_batch_size = 0
+
+                     local_batch.append(ld)
+                     current_batch_size += file_size
+
+                 if local_batch:
+                     up_docs, progress = self._process_local(
+                         local_batch, upload_api, progress, pbar
+                     )
+
+                     ingest = self.documents.ingest_remote(
+                         documents=up_docs,
+                         callback_data=callback_data,
+                         callback_url=callback_url,
+                         request_options=request_options,
+                     )
+                     ingest, progress = self._monitor_batch(ingest, progress, pbar)
+
+                 if progress > 0:
+                     pbar.update(progress)
+
+             return ingest
+         elif len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
+             raise ValueError("You have sent too many documents in this request")
+
+         up_docs, _ = self._process_local(local_documents, upload_api, 0, None)
+         remote_documents.extend(up_docs)
+
+         return self.documents.ingest_remote(
+             documents=remote_documents,
+             callback_url=callback_url,
+             callback_data=callback_data,
+             request_options=request_options,
+         )
+
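When wait_for_complete is set, the client drives the whole pipeline itself: it batches remote documents, uploads and batches local ones, and polls each batch to completion while a tqdm bar advances 0.25 per file on upload and 0.75 as processing finishes. A usage sketch mixing both kinds of source (bucket ID and paths are illustrative):

    from groundx import Document, GroundX

    client = GroundX(api_key="YOUR_API_KEY")

    response = client.ingest(
        documents=[
            Document(bucket_id=1234, file_path="https://example.com/file1.pdf"),
            Document(bucket_id=1234, file_path="~/reports/q3.xlsx"),
        ],
        batch_size=10,
        wait_for_complete=True,
    )
    print(response.ingest.status)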
+     def ingest_directory(
+         self,
+         *,
+         bucket_id: int,
+         path: str,
+         batch_size: int = 10,
+         upload_api: str = "https://api.eyelevel.ai/upload/file",
+         callback_url: typing.Optional[str] = None,
+         callback_data: typing.Optional[str] = None,
+         request_options: typing.Optional[RequestOptions] = None,
+     ):
+         """
+         Ingest documents from a local directory into a GroundX bucket.
+
+         Parameters
+         ----------
+         bucket_id : int
+         path : str
+         batch_size : typing.Optional[int]
+
+         # an endpoint that accepts 'name' and 'type' query params
+         # and returns a presigned URL in a JSON dictionary with key 'URL'
+         upload_api : typing.Optional[str]
+
+         # an endpoint that will receive processing event updates as POST
+         callback_url : typing.Optional[str]
+
+         # a string that is returned, along with processing event updates,
+         # to the callback URL.
+         callback_data : typing.Optional[str]
+
+         request_options : typing.Optional[RequestOptions]
+             Request-specific configuration.
+
+         Examples
+         --------
+         from groundx import GroundX
+
+         client = GroundX(
+             api_key="YOUR_API_KEY",
+         )
+
+         client.ingest_directory(
+             bucket_id=1234,
+             path="/path/to/directory"
+         )
+         """
+
+         def is_valid_local_directory(path: str) -> bool:
+             expanded_path = os.path.expanduser(path)
+             return os.path.isdir(expanded_path)
+
+         def load_directory_files(directory: str) -> typing.List[Path]:
+             dir_path = Path(directory)
+
+             matched_files: typing.List[Path] = []
+             for file in dir_path.rglob("*"):
+                 for sd in split_doc(file):
+                     matched_files.append(sd)
+
+             return matched_files
+
+         if bucket_id < 1:
+             raise ValueError(f"Invalid bucket_id: {bucket_id}")
+
+         if not is_valid_local_directory(path):
+             raise ValueError(f"Invalid directory path: {path}")
+
+         files = load_directory_files(path)
+
+         if len(files) < 1:
+             raise ValueError(f"No supported files found in directory: {path}")
+
+         current_batch: typing.List[Path] = []
+         current_batch_size: int = 0
+
+         n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))
+
+         with tqdm(total=len(files), desc="Ingesting Files", unit="file") as pbar:
+             for file in files:
+                 file_size = file.stat().st_size
+
+                 if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (
+                     len(current_batch) >= n
+                 ):
+                     self._upload_file_batch(
+                         bucket_id,
+                         current_batch,
+                         upload_api,
+                         callback_url,
+                         callback_data,
+                         request_options,
+                         pbar,
+                     )
+                     current_batch = []
+                     current_batch_size = 0
+
+                 current_batch.append(file)
+                 current_batch_size += file_size
+
+             if current_batch:
+                 self._upload_file_batch(
+                     bucket_id,
+                     current_batch,
+                     upload_api,
+                     callback_url,
+                     callback_data,
+                     request_options,
+                     pbar,
+                 )
+
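Directory batches close when either bound trips: batch_size files (clamped into the 1..MAX_BATCH_SIZE range) or MAX_BATCH_SIZE_BYTES of payload. The clamp mirrors the expression used above (the clamp helper name is illustrative):

    def clamp(batch_size):
        # same expression used by ingest() and ingest_directory()
        return max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))

    assert clamp(10) == 10
    assert clamp(0) == 1     # falsy values fall back to MIN_BATCH_SIZE
    assert clamp(999) == 50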
+     def _upload_file(
+         self,
+         endpoint: str,
+         file_path: Path,
+     ) -> str:
+         file_name = os.path.basename(file_path)
+         file_extension = os.path.splitext(file_name)[1][1:].lower()
+         if f".{file_extension}" in SUFFIX_ALIASES:
+             file_extension = SUFFIX_ALIASES[f".{file_extension}"]
+
+         presigned_info = get_presigned_url(endpoint, file_name, file_extension)
+
+         upload_url = presigned_info["URL"]
+         hd = presigned_info.get("Header", {})
+         method = presigned_info.get("Method", "PUT").upper()
+
+         headers: typing.Dict[str, typing.Any] = {}
+         for key, value in hd.items():
+             if isinstance(value, list):
+                 headers[key.upper()] = value[0]
+
+         try:
+             with open(file_path, "rb") as f:
+                 file_data = f.read()
+         except Exception as e:
+             raise ValueError(f"Error reading file {file_path}: {e}")
+
+         if method == "PUT":
+             upload_response = requests.put(upload_url, data=file_data, headers=headers)
+         else:
+             raise ValueError(f"Unsupported HTTP method: {method}")
+
+         if upload_response.status_code not in (200, 201):
+             raise Exception(
+                 f"Upload failed: {upload_response.status_code} - {upload_response.text}"
+             )
+
+         if "GX-HOSTED-URL" in headers:
+             return headers["GX-HOSTED-URL"]
+
+         return strip_query_params(upload_url)
+
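_upload_file expects the upload_api endpoint to answer GET ?name=...&type=... with a JSON object whose "URL" key is a presigned PUT target. "Header" and "Method" are optional: header values supplied as lists are forwarded (first element only, keys upper-cased), only PUT is supported, and a GX-HOSTED-URL header, when present, is returned instead of the stripped upload URL. A response shape the client would accept (all values illustrative):

    presigned_info = {
        "URL": "https://uploads.example.com/bucket/my_file.pdf?X-Amz-Signature=...",
        "Method": "PUT",  # anything other than PUT raises ValueError
        "Header": {
            "Content-Type": ["application/pdf"],
            "GX-HOSTED-URL": ["https://cdn.example.com/my_file.pdf"],
        },
    }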
+     def _process_local(
+         self,
+         local_docs: typing.List[Document],
+         upload_api: str,
+         progress: float,
+         pbar: typing.Optional[typing.Any] = None,
+     ) -> typing.Tuple[typing.List[IngestRemoteDocument], float]:
+         remote_docs: typing.List[IngestRemoteDocument] = []
+         for d in local_docs:
+             splits = split_doc(Path(os.path.expanduser(d.file_path)))
+
+             for sd in splits:
+                 url = self._upload_file(upload_api, sd)
+
+                 ft = d.file_type
+                 if sd.suffix.lower() in SUFFIX_ALIASES:
+                     ft = SUFFIX_ALIASES[sd.suffix.lower()]
+
+                 fn = sd.name
+                 if len(splits) == 1 and d.file_name:
+                     fn = d.file_name
+
+                 remote_docs.append(
+                     IngestRemoteDocument(
+                         bucket_id=d.bucket_id,
+                         file_name=fn,
+                         file_type=ft,
+                         filter=d.filter,
+                         process_level=d.process_level,
+                         search_data=d.search_data,
+                         source_url=url,
+                     )
+                 )
+
+             progress -= 0.25
+             if pbar is not None and pbar.update is not None:
+                 pbar.update(0.25)
+
+         return remote_docs, progress
+
+     def _monitor_batch(
+         self,
+         ingest: IngestResponse,
+         progress: float,
+         pbar: typing.Any,
+     ) -> typing.Tuple[IngestResponse, float]:
+         completed_files: typing.Set[str] = set()
+
+         while ingest.ingest.status not in ["complete", "error", "cancelled"]:
+             time.sleep(3)
+             ingest = self.documents.get_processing_status_by_id(
+                 ingest.ingest.process_id
+             )
+
+             if ingest.ingest.progress:
+                 # all four progress groups share one bookkeeping rule:
+                 # credit 0.75 once per document that reaches a terminal status
+                 groups = [
+                     ingest.ingest.progress.processing,
+                     ingest.ingest.progress.complete,
+                     ingest.ingest.progress.cancelled,
+                     ingest.ingest.progress.errors,
+                 ]
+                 for group in groups:
+                     if group and group.documents:
+                         for doc in group.documents:
+                             if (
+                                 doc.status in ["complete", "error", "cancelled"]
+                                 and doc.document_id not in completed_files
+                             ):
+                                 pbar.update(0.75)
+                                 progress -= 0.75
+                                 completed_files.add(doc.document_id)
+
+         if ingest.ingest.status in ["error", "cancelled"]:
+             raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
+
+         return ingest, progress
+
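Progress accounting treats each file as one unit: 0.25 is credited at upload and the remaining 0.75 when _monitor_batch sees the document reach a terminal status. A worked example for a four-file batch:

    progress = 4.0          # one unit per file
    progress -= 4 * 0.25    # pbar.update(0.25) per file uploaded
    progress -= 4 * 0.75    # pbar.update(0.75) per document completing
    assert progress == 0.0  # any positive remainder is flushed via pbar.update(progress)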
+     def _upload_file_batch(
+         self,
+         bucket_id: int,
+         batch: typing.List[Path],
+         upload_api: str,
+         callback_url: typing.Optional[str],
+         callback_data: typing.Optional[str],
+         request_options: typing.Optional[RequestOptions],
+         pbar: typing.Any,
+     ) -> None:
+         docs: typing.List[Document] = []
+
+         progress = float(len(batch))
+         for file in batch:
+             url = self._upload_file(upload_api, file)
+             if file.suffix.lower() in SUFFIX_ALIASES:
+                 docs.append(
+                     Document(
+                         bucket_id=bucket_id,
+                         file_name=file.name,
+                         file_path=url,
+                         file_type=SUFFIX_ALIASES[file.suffix.lower()],
+                     ),
+                 )
+             else:
+                 docs.append(
+                     Document(
+                         bucket_id=bucket_id,
+                         file_name=file.name,
+                         file_path=url,
+                     ),
+                 )
+             pbar.update(0.25)
+             progress -= 0.25
+
+         if docs:
+             ingest = self.ingest(
+                 documents=docs,
+                 callback_data=callback_data,
+                 callback_url=callback_url,
+                 request_options=request_options,
+             )
+             ingest, progress = self._monitor_batch(ingest, progress, pbar)
+
+         if progress > 0:
+             pbar.update(progress)
+
+
+ class AsyncGroundX(AsyncGroundXBase):
+     async def ingest(
+         self,
+         *,
+         documents: typing.Sequence[Document],
+         upload_api: str = "https://api.eyelevel.ai/upload/file",
+         callback_url: typing.Optional[str] = None,
+         callback_data: typing.Optional[str] = None,
+         request_options: typing.Optional[RequestOptions] = None,
+     ) -> IngestResponse:
+         """
+         Ingest local or hosted documents into a GroundX bucket.
+
+         Parameters
+         ----------
+         documents : typing.Sequence[Document]
+
+         # an endpoint that accepts 'name' and 'type' query params
+         # and returns a presigned URL in a JSON dictionary with key 'URL'
+         upload_api : typing.Optional[str]
+
+         # an endpoint that will receive processing event updates as POST
+         callback_url : typing.Optional[str]
+
+         # a string that is returned, along with processing event updates,
+         # to the callback URL.
+         callback_data : typing.Optional[str]
+
+         request_options : typing.Optional[RequestOptions]
+             Request-specific configuration.
+
+         Returns
+         -------
+         IngestResponse
+             Documents successfully uploaded
+
+         Examples
+         --------
+         import asyncio
+
+         from groundx import AsyncGroundX, Document
+
+         client = AsyncGroundX(
+             api_key="YOUR_API_KEY",
+         )
+
+         async def main() -> None:
+             await client.ingest(
+                 documents=[
+                     Document(
+                         bucket_id=1234,
+                         file_name="my_file1.txt",
+                         file_path="https://my.source.url.com/file1.txt",
+                         file_type="txt",
+                     )
+                 ],
+             )
+
+         asyncio.run(main())
+         """
+         remote_documents, local_documents = prep_documents(documents)
+
+         if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
+             raise ValueError("You have sent too many documents in this request")
+
+         if len(remote_documents) + len(local_documents) == 0:
+             raise ValueError("No valid documents were provided")
+
+         for d in local_documents:
+             splits = split_doc(Path(os.path.expanduser(d.file_path)))
+
+             for sd in splits:
+                 url = self._upload_file(upload_api, sd)
+
+                 ft = d.file_type
+                 if sd.suffix.lower() in SUFFIX_ALIASES:
+                     ft = SUFFIX_ALIASES[sd.suffix.lower()]
+
+                 fn = sd.name
+                 if len(splits) == 1 and d.file_name:
+                     fn = d.file_name
+
+                 remote_documents.append(
+                     IngestRemoteDocument(
+                         bucket_id=d.bucket_id,
+                         file_name=fn,
+                         file_type=ft,
+                         filter=d.filter,
+                         process_level=d.process_level,
+                         search_data=d.search_data,
+                         source_url=url,
+                     )
+                 )
+
+         return await self.documents.ingest_remote(
+             documents=remote_documents,
+             callback_url=callback_url,
+             callback_data=callback_data,
+             request_options=request_options,
+         )
+
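The async client uploads local files inline (the presigned PUT in _upload_file is synchronous requests code), enforces MAX_BATCH_SIZE up front instead of batching, and never polls. A sketch of checking status afterwards; the status call mirrors the one used by the synchronous monitor loop above and is assumed, not confirmed, to be awaitable on the async client:

    import asyncio
    from groundx import AsyncGroundX, Document

    client = AsyncGroundX(api_key="YOUR_API_KEY")

    async def main() -> None:
        response = await client.ingest(
            documents=[Document(bucket_id=1234, file_path="~/reports/q3.pdf")],
        )
        status = await client.documents.get_processing_status_by_id(
            response.ingest.process_id
        )
        print(status.ingest.status)

    asyncio.run(main())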
+     def _upload_file(
+         self,
+         endpoint: str,
+         file_path: Path,
+     ) -> str:
+         file_name = os.path.basename(file_path)
+         file_extension = os.path.splitext(file_name)[1][1:].lower()
+         if f".{file_extension}" in SUFFIX_ALIASES:
+             file_extension = SUFFIX_ALIASES[f".{file_extension}"]
+
+         presigned_info = get_presigned_url(endpoint, file_name, file_extension)
+
+         upload_url = presigned_info["URL"]
+         hd = presigned_info.get("Header", {})
+         method = presigned_info.get("Method", "PUT").upper()
+
+         headers: typing.Dict[str, typing.Any] = {}
+         for key, value in hd.items():
+             if isinstance(value, list):
+                 headers[key.upper()] = value[0]
+
+         try:
+             with open(file_path, "rb") as f:
+                 file_data = f.read()
+         except Exception as e:
+             raise ValueError(f"Error reading file {file_path}: {e}")
+
+         if method == "PUT":
+             upload_response = requests.put(upload_url, data=file_data, headers=headers)
+         else:
+             raise ValueError(f"Unsupported HTTP method: {method}")
+
+         if upload_response.status_code not in (200, 201):
+             raise Exception(
+                 f"Upload failed: {upload_response.status_code} - {upload_response.text}"
+             )
+
+         if "GX-HOSTED-URL" in headers:
+             return headers["GX-HOSTED-URL"]
+
+         return strip_query_params(upload_url)