groundx 2.2.8__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {groundx-2.2.8 → groundx-2.3.0}/PKG-INFO +1 -1
  2. {groundx-2.2.8 → groundx-2.3.0}/pyproject.toml +1 -1
  3. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/client_wrapper.py +1 -1
  4. groundx-2.3.0/src/groundx/csv_splitter.py +64 -0
  5. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/ingest.py +247 -93
  6. {groundx-2.2.8 → groundx-2.3.0}/LICENSE +0 -0
  7. {groundx-2.2.8 → groundx-2.3.0}/README.md +0 -0
  8. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/__init__.py +0 -0
  9. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/buckets/__init__.py +0 -0
  10. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/buckets/client.py +0 -0
  11. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/client.py +0 -0
  12. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/__init__.py +0 -0
  13. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/api_error.py +0 -0
  14. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/datetime_utils.py +0 -0
  15. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/file.py +0 -0
  16. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/http_client.py +0 -0
  17. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/jsonable_encoder.py +0 -0
  18. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/pydantic_utilities.py +0 -0
  19. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/query_encoder.py +0 -0
  20. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/remove_none_from_dict.py +0 -0
  21. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/request_options.py +0 -0
  22. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/core/serialization.py +0 -0
  23. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/customer/__init__.py +0 -0
  24. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/customer/client.py +0 -0
  25. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/documents/__init__.py +0 -0
  26. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/documents/client.py +0 -0
  27. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/environment.py +0 -0
  28. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/errors/__init__.py +0 -0
  29. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/errors/bad_request_error.py +0 -0
  30. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/errors/unauthorized_error.py +0 -0
  31. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/groups/__init__.py +0 -0
  32. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/groups/client.py +0 -0
  33. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/health/__init__.py +0 -0
  34. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/health/client.py +0 -0
  35. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/py.typed +0 -0
  36. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/search/__init__.py +0 -0
  37. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/search/client.py +0 -0
  38. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/search/types/__init__.py +0 -0
  39. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/search/types/search_content_request_id.py +0 -0
  40. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/__init__.py +0 -0
  41. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bounding_box_detail.py +0 -0
  42. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_detail.py +0 -0
  43. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_list_response.py +0 -0
  44. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_response.py +0 -0
  45. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_update_detail.py +0 -0
  46. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/bucket_update_response.py +0 -0
  47. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/customer_detail.py +0 -0
  48. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/customer_response.py +0 -0
  49. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document.py +0 -0
  50. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_detail.py +0 -0
  51. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_list_response.py +0 -0
  52. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_local_ingest_request.py +0 -0
  53. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_lookup_response.py +0 -0
  54. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_response.py +0 -0
  55. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/document_type.py +0 -0
  56. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/group_detail.py +0 -0
  57. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/group_list_response.py +0 -0
  58. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/group_response.py +0 -0
  59. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/health_response.py +0 -0
  60. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/health_response_health.py +0 -0
  61. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/health_service.py +0 -0
  62. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/health_service_status.py +0 -0
  63. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_local_document.py +0 -0
  64. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_local_document_metadata.py +0 -0
  65. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_remote_document.py +0 -0
  66. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_response.py +0 -0
  67. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/ingest_response_ingest.py +0 -0
  68. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/message_response.py +0 -0
  69. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/meter_detail.py +0 -0
  70. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_level.py +0 -0
  71. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response.py +0 -0
  72. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest.py +0 -0
  73. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress.py +0 -0
  74. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress_cancelled.py +0 -0
  75. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress_complete.py +0 -0
  76. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress_errors.py +0 -0
  77. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/process_status_response_ingest_progress_processing.py +0 -0
  78. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/processes_status_response.py +0 -0
  79. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/processing_status.py +0 -0
  80. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/search_response.py +0 -0
  81. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/search_response_search.py +0 -0
  82. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/search_result_item.py +0 -0
  83. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/sort.py +0 -0
  84. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/sort_order.py +0 -0
  85. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/subscription_detail.py +0 -0
  86. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/subscription_detail_meters.py +0 -0
  87. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/types/website_source.py +0 -0
  88. {groundx-2.2.8 → groundx-2.3.0}/src/groundx/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: groundx
3
- Version: 2.2.8
3
+ Version: 2.3.0
4
4
  Summary:
5
5
  License: MIT
6
6
  Requires-Python: >=3.8,<4.0
@@ -3,7 +3,7 @@ name = "groundx"
3
3
 
4
4
  [tool.poetry]
5
5
  name = "groundx"
6
- version = "2.2.8"
6
+ version = "2.3.0"
7
7
  description = ""
8
8
  readme = "README.md"
9
9
  authors = []
@@ -16,7 +16,7 @@ class BaseClientWrapper:
16
16
  headers: typing.Dict[str, str] = {
17
17
  "X-Fern-Language": "Python",
18
18
  "X-Fern-SDK-Name": "groundx",
19
- "X-Fern-SDK-Version": "2.2.8",
19
+ "X-Fern-SDK-Version": "2.3.0",
20
20
  }
21
21
  headers["X-API-Key"] = self.api_key
22
22
  return headers
@@ -0,0 +1,64 @@
1
+ import csv, math, os, tempfile, typing
2
+ from pathlib import Path
3
+
4
+
5
class CSVSplitter:
    """Split a delimited text file into smaller files that each repeat the header row.

    The number of chunks is chosen so that a chunk holds roughly at most
    1000 data lines and the source contributes roughly at most 1 MiB per
    chunk, whichever requires more chunks.

    Parameters
    ----------
    filepath : str or os.PathLike
        Path of the delimited file to split. It is read as UTF-8.
    delimiter : str
        Field delimiter passed to ``csv.reader`` / ``csv.writer``.
    """

    def __init__(self, filepath, delimiter=','):
        self.filepath = filepath
        self.delimiter = delimiter
        self.filename = os.path.basename(filepath)
        self.file_size = os.path.getsize(filepath)
        self.rows_count = self.get_row_count()

    def get_row_count(self):
        """Return the number of physical lines after the header line (never negative).

        Lines are counted without CSV parsing, so a quoted field containing
        a line break counts more than once. The count is therefore an upper
        bound on the number of records, which keeps chunks at or below the
        intended size.
        """
        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
            # An empty file has no header line; clamp instead of reporting -1.
            return max(sum(1 for _ in csvfile) - 1, 0)

    def determine_splits(self):
        """Return how many chunks the file should be divided into (at least 1)."""
        row_mod = self.rows_count // 1000 + 1
        file_mod = self.file_size // (1024 * 1024) + 1

        return max(row_mod, file_mod)

    def split(self):
        """Write the chunks to a fresh temporary directory and return their paths.

        Returns
        -------
        typing.List[Path]
            ``[Path(self.filepath)]`` when no split is needed or when the file
            has no data rows; otherwise the chunk files, named
            ``<stem>_<n>.csv`` with ``n`` starting at 1, each beginning with
            the original header row.
        """
        source = Path(self.filepath)

        splits = self.determine_splits()
        if splits < 2:
            return [source]

        # Never let the chunk size reach 0: it is used as a modulus below.
        rows_per_file = max(1, math.ceil(self.rows_count / splits))
        stem = os.path.splitext(self.filename)[0]

        split_files: typing.List[Path] = []
        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=self.delimiter)
            headers = next(reader, None)
            if headers is None:
                return [source]

            # Created lazily so a header-only file leaves no empty directory behind.
            temp_dir = None
            current_output_file = None
            current_writer = None

            try:
                for current_row, row in enumerate(reader):
                    if current_row % rows_per_file == 0:
                        if current_output_file is not None:
                            current_output_file.close()
                            current_output_file = None
                        if temp_dir is None:
                            temp_dir = tempfile.mkdtemp()
                        # NOTE(review): chunks are named ".csv" even when the
                        # delimiter is a tab; confirm this is intended for TSV input.
                        output_file_path = os.path.join(
                            temp_dir,
                            f"{stem}_{len(split_files) + 1}.csv",
                        )
                        current_output_file = open(
                            output_file_path, "w", newline="", encoding="utf-8"
                        )
                        split_files.append(Path(output_file_path))
                        current_writer = csv.writer(current_output_file, delimiter=self.delimiter)
                        current_writer.writerow(headers)

                    current_writer.writerow(row)
            finally:
                # Close the open chunk even if reading or writing raised.
                if current_output_file is not None:
                    current_output_file.close()

        # A file with no data rows yields no chunks; hand back the original
        # rather than an empty list so it is not silently dropped.
        return split_files or [source]
@@ -5,10 +5,11 @@ from urllib.parse import urlparse, urlunparse
5
5
 
6
6
  from .client import GroundXBase, AsyncGroundXBase
7
7
  from .core.request_options import RequestOptions
8
+ from .csv_splitter import CSVSplitter
8
9
  from .types.document import Document
9
- from .types.document_type import DocumentType
10
10
  from .types.ingest_remote_document import IngestRemoteDocument
11
11
  from .types.ingest_response import IngestResponse
12
+ from .types.ingest_response_ingest import IngestResponseIngest
12
13
 
13
14
  # this is used as the default value for optional parameters
14
15
  OMIT = typing.cast(typing.Any, ...)
@@ -38,10 +39,18 @@ MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
38
39
 
39
40
  ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
40
41
 
42
+ CSV_SPLITS = {
43
+ ".csv": True,
44
+ }
45
+ TSV_SPLITS = {
46
+ ".tsv": True,
47
+ }
48
+
41
49
  SUFFIX_ALIASES = {
42
- ".jpeg": ".jpg",
43
- ".heic": ".heif",
44
- ".tif": ".tiff",
50
+ ".jpeg": "jpg",
51
+ ".heic": "heif",
52
+ ".tif": "tiff",
53
+ ".md": "txt",
45
54
  }
46
55
 
47
56
  MAX_BATCH_SIZE = 50
@@ -115,11 +124,25 @@ def prep_documents(
115
124
  return remote_documents, local_documents
116
125
 
117
126
 
127
def split_doc(file):
    """Return the list of paths that should be uploaded for ``file``.

    Paths that are not regular files, or whose lower-cased suffix is in
    neither ALLOWED_SUFFIXES nor SUFFIX_ALIASES, yield an empty list.
    Suffixes listed in CSV_SPLITS / TSV_SPLITS are handed to CSVSplitter
    (comma and tab delimited respectively), which may return several chunk
    files. Every other accepted file is returned unchanged as a
    one-element list.
    """
    if not file.is_file():
        return []

    suffix = file.suffix.lower()
    if suffix not in ALLOWED_SUFFIXES and suffix not in SUFFIX_ALIASES:
        return []

    if suffix in CSV_SPLITS:
        return CSVSplitter(filepath=file).split()
    if suffix in TSV_SPLITS:
        return CSVSplitter(filepath=file, delimiter='\t').split()

    return [file]
138
+
118
139
  class GroundX(GroundXBase):
119
140
  def ingest(
120
141
  self,
121
142
  *,
122
143
  documents: typing.Sequence[Document],
144
+ batch_size: typing.Optional[int] = 10,
145
+ wait_for_complete: typing.Optional[bool] = False,
123
146
  upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
124
147
  request_options: typing.Optional[RequestOptions] = None,
125
148
  ) -> IngestResponse:
@@ -130,6 +153,13 @@ class GroundX(GroundXBase):
130
153
  ----------
131
154
  documents : typing.Sequence[Document]
132
155
 
156
+ # defines how many files to send per batch
157
+ # ignored unless wait_for_complete is True
158
+ batch_size : typing.Optional[int]
159
+
160
+ # will turn on progress bar and wait for ingestion to complete
161
+ wait_for_complete : typing.Optional[bool]
162
+
133
163
  # an endpoint that accepts 'name' and 'type' query params
134
164
  # and returns a presigned URL in a JSON dictionary with key 'URL'
135
165
  upload_api : typing.Optional[str]
@@ -163,41 +193,87 @@ class GroundX(GroundXBase):
163
193
  """
164
194
  remote_documents, local_documents = prep_documents(documents)
165
195
 
166
- if local_documents and remote_documents:
167
- raise ValueError("Documents must all be either local or remote, not a mix.")
196
+ if len(remote_documents) + len(local_documents) == 0:
197
+ raise ValueError("No valid documents were provided")
168
198
 
169
- if len(remote_documents) > 0:
170
- if len(remote_documents) > MAX_BATCH_SIZE:
171
- raise ValueError("You have sent too many documents in this request")
199
+ if wait_for_complete:
200
+ with tqdm(total=len(remote_documents) + len(local_documents), desc="Ingesting Files", unit="file") as pbar:
201
+ n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))
172
202
 
173
- return self.documents.ingest_remote(
174
- documents=remote_documents,
175
- request_options=request_options,
176
- )
203
+ remote_batch: typing.List[IngestRemoteDocument] = []
204
+ ingest = IngestResponse(ingest=IngestResponseIngest(process_id="",status="queued"))
205
+
206
+ progress = float(len(remote_documents))
207
+ for rd in remote_documents:
208
+ if len(remote_batch) >= n:
209
+ ingest = self.documents.ingest_remote(
210
+ documents=remote_batch,
211
+ request_options=request_options,
212
+ )
213
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
214
+
215
+ remote_batch = []
216
+
217
+ remote_batch.append(rd)
218
+ pbar.update(0.25)
219
+ progress -= 0.25
220
+
221
+ if remote_batch:
222
+ ingest = self.documents.ingest_remote(
223
+ documents=remote_batch,
224
+ request_options=request_options,
225
+ )
226
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
227
+
228
+
229
+ if progress > 0:
230
+ pbar.update(progress)
231
+
232
+ current_batch_size = 0
233
+ local_batch: typing.List[Document] = []
234
+
235
+ progress = float(len(local_documents))
236
+ for ld in local_documents:
237
+ fp = Path(os.path.expanduser(ld.file_path))
238
+ file_size = fp.stat().st_size
239
+
240
+ if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (len(local_batch) >= n):
241
+ up_docs, progress = self._process_local(local_batch, upload_api, progress, pbar)
177
242
 
178
- if len(local_documents) > MAX_BATCH_SIZE:
243
+ ingest = self.documents.ingest_remote(
244
+ documents=up_docs,
245
+ request_options=request_options,
246
+ )
247
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
248
+
249
+ local_batch = []
250
+ current_batch_size = 0
251
+
252
+ local_batch.append(ld)
253
+ current_batch_size += file_size
254
+
255
+ if local_batch:
256
+ up_docs, progress = self._process_local(local_batch, upload_api, progress, pbar)
257
+
258
+ ingest = self.documents.ingest_remote(
259
+ documents=up_docs,
260
+ request_options=request_options,
261
+ )
262
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
263
+
264
+ if progress > 0:
265
+ pbar.update(progress)
266
+
267
+ return ingest
268
+ elif len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
179
269
  raise ValueError("You have sent too many documents in this request")
180
270
 
181
- if len(local_documents) == 0:
182
- raise ValueError("No valid documents were provided")
183
271
 
184
- docs: typing.List[IngestRemoteDocument] = []
185
- for d in local_documents:
186
- url = self._upload_file(upload_api, Path(os.path.expanduser(d.file_path)))
187
-
188
- docs.append(
189
- IngestRemoteDocument(
190
- bucket_id=d.bucket_id,
191
- file_name=d.file_name,
192
- file_type=d.file_type,
193
- process_level=d.process_level,
194
- search_data=d.search_data,
195
- source_url=url,
196
- )
197
- )
272
+ up_docs, _ = self._process_local(local_documents, upload_api)
273
+ remote_documents.extend(up_docs)
198
274
 
199
275
  return self.documents.ingest_remote(
200
- documents=docs,
276
+ documents=remote_documents,
201
277
  request_options=request_options,
202
278
  )
203
279
 
@@ -252,14 +328,10 @@ class GroundX(GroundXBase):
252
328
  def load_directory_files(directory: str) -> typing.List[Path]:
253
329
  dir_path = Path(directory)
254
330
 
255
- matched_files = [
256
- file
257
- for file in dir_path.rglob("*")
258
- if file.is_file() and (
259
- file.suffix.lower() in ALLOWED_SUFFIXES
260
- or file.suffix.lower() in SUFFIX_ALIASES
261
- )
262
- ]
331
+ matched_files: typing.List[Path] = []
332
+ for file in dir_path.rglob("*"):
333
+ for sd in split_doc(file):
334
+ matched_files.append(sd)
263
335
 
264
336
  return matched_files
265
337
 
@@ -301,6 +373,8 @@ class GroundX(GroundXBase):
301
373
  ):
302
374
  file_name = os.path.basename(file_path)
303
375
  file_extension = os.path.splitext(file_name)[1][1:].lower()
376
+ if f".{file_extension}" in SUFFIX_ALIASES:
377
+ file_extension = SUFFIX_ALIASES[f".{file_extension}"]
304
378
 
305
379
  presigned_info = get_presigned_url(endpoint, file_name, file_extension)
306
380
 
@@ -330,6 +404,92 @@ class GroundX(GroundXBase):
330
404
 
331
405
  return strip_query_params(upload_url)
332
406
 
407
def _process_local(
    self,
    local_docs,
    upload_api,
    progress = None,
    pbar = None,
):
    """Upload local documents and describe them as remote documents.

    Each document's path is expanded and passed through ``split_doc``;
    every resulting file is uploaded with ``self._upload_file`` and turned
    into an ``IngestRemoteDocument`` pointing at the returned URL. The
    caller-supplied ``file_name`` is kept only when the document was not
    split into several files. A suffix found in SUFFIX_ALIASES overrides
    the document's ``file_type``.

    When both ``progress`` and ``pbar`` are given, the bar is advanced by
    0.25 per local document and the same amount is subtracted from
    ``progress``.

    Returns a ``(remote_documents, progress)`` tuple.
    """
    # `progress` only ever changes by subtraction, so this cannot flip mid-loop.
    track_progress = progress is not None and pbar is not None and pbar.update is not None

    uploaded = []
    for doc in local_docs:
        pieces = split_doc(Path(os.path.expanduser(doc.file_path)))

        for piece in pieces:
            source_url = self._upload_file(upload_api, piece)

            suffix = piece.suffix.lower()
            if suffix in SUFFIX_ALIASES:
                file_type = SUFFIX_ALIASES[suffix]
            else:
                file_type = doc.file_type

            # Chunk files need distinct names, so the caller's name is only
            # honoured for an unsplit document.
            if len(pieces) == 1 and doc.file_name:
                file_name = doc.file_name
            else:
                file_name = piece.name

            uploaded.append(
                IngestRemoteDocument(
                    bucket_id=doc.bucket_id,
                    file_name=file_name,
                    file_type=file_type,
                    process_level=doc.process_level,
                    search_data=doc.search_data,
                    source_url=source_url,
                )
            )

        if track_progress:
            pbar.update(0.25)
            progress -= 0.25

    return uploaded, progress
445
+
446
def _monitor_batch(
    self,
    ingest,
    progress,
    pbar,
):
    """Poll an ingest process until it reaches a terminal status.

    While ``ingest`` is not None and its status is not "complete", "error"
    or "cancelled", this sleeps three seconds, refreshes the status via
    ``self.documents.get_processing_status_by_id`` and walks the
    processing / complete / cancelled / errors document lists. The first
    time a document id is seen in a terminal state, the bar advances by
    0.75 and ``progress`` drops by the same amount.

    Raises ValueError when a refreshed status is "error" or "cancelled".

    Returns an ``(ingest, progress)`` tuple with the latest status object.
    """
    finished_states = ["complete", "error", "cancelled"]
    seen_ids = set()

    while ingest is not None and ingest.ingest.status not in finished_states:
        time.sleep(3)
        ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)

        report = ingest.ingest.progress
        if report:
            # Same bucket order as the status payload is inspected in.
            for bucket_name in ("processing", "complete", "cancelled", "errors"):
                bucket = getattr(report, bucket_name)
                if not (bucket and bucket.documents):
                    continue
                for doc in bucket.documents:
                    if doc.status in finished_states and doc.document_id not in seen_ids:
                        pbar.update(0.75)
                        progress -= 0.75
                        seen_ids.add(doc.document_id)

        # Checked after every poll so a failed batch surfaces immediately.
        if ingest.ingest.status in ["error", "cancelled"]:
            raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")

    return ingest, progress
492
+
333
493
  def _upload_file_batch(
334
494
  self,
335
495
  bucket_id,
@@ -340,41 +500,35 @@ class GroundX(GroundXBase):
340
500
  ):
341
501
  docs = []
342
502
 
343
- progress = len(batch)
503
+ progress = float(len(batch))
344
504
  for file in batch:
345
505
  url = self._upload_file(upload_api, file)
346
- docs.append(
347
- Document(
348
- bucket_id=bucket_id,
349
- file_path=url,
350
- ),
351
- )
506
+ if file.suffix.lower() in SUFFIX_ALIASES:
507
+ docs.append(
508
+ Document(
509
+ bucket_id=bucket_id,
510
+ file_name=file.name,
511
+ file_path=url,
512
+ file_type=SUFFIX_ALIASES[file.suffix.lower()],
513
+ ),
514
+ )
515
+ else:
516
+ docs.append(
517
+ Document(
518
+ bucket_id=bucket_id,
519
+ file_name=file.name,
520
+ file_path=url,
521
+ ),
522
+ )
352
523
  pbar.update(0.25)
353
524
  progress -= 0.25
354
525
 
355
526
  if docs:
356
527
  ingest = self.ingest(documents=docs, request_options=request_options)
528
+ ingest, progress = self._monitor_batch(ingest, progress, pbar)
357
529
 
358
- completed_files = set()
359
-
360
- while (
361
- ingest is not None
362
- and ingest.ingest.status not in ["complete", "error", "cancelled"]
363
- ):
364
- time.sleep(3)
365
- ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
366
-
367
- if ingest.ingest.progress and ingest.ingest.progress.processing:
368
- for doc in ingest.ingest.progress.processing.documents:
369
- if doc.status == "complete" and doc.document_id not in completed_files:
370
- pbar.update(0.75)
371
- progress -= 0.75
372
-
373
- if ingest.ingest.status in ["error", "cancelled"]:
374
- raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
375
-
376
- if progress > 0:
377
- pbar.update(progress)
530
+ if progress > 0:
531
+ pbar.update(progress)
378
532
 
379
533
 
380
534
 
@@ -431,41 +585,39 @@ class AsyncGroundX(AsyncGroundXBase):
431
585
  """
432
586
  remote_documents, local_documents = prep_documents(documents)
433
587
 
434
- if local_documents and remote_documents:
435
- raise ValueError("Documents must all be either local or remote, not a mix.")
436
-
437
- if len(remote_documents) > 0:
438
- if len(remote_documents) > MAX_BATCH_SIZE:
439
- raise ValueError("You have sent too many documents in this request")
440
-
441
- return await self.documents.ingest_remote(
442
- documents=remote_documents,
443
- request_options=request_options,
444
- )
445
-
446
- if len(local_documents) > MAX_BATCH_SIZE:
588
+ if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
447
589
  raise ValueError("You have sent too many documents in this request")
448
590
 
449
- if len(local_documents) == 0:
591
+ if len(remote_documents) + len(local_documents) == 0:
450
592
  raise ValueError("No valid documents were provided")
451
593
 
452
- docs: typing.List[IngestRemoteDocument] = []
453
594
  for d in local_documents:
454
- url = self._upload_file(upload_api, Path(os.path.expanduser(d.file_path)))
455
-
456
- docs.append(
457
- IngestRemoteDocument(
458
- bucket_id=d.bucket_id,
459
- file_name=d.file_name,
460
- file_type=d.file_type,
461
- process_level=d.process_level,
462
- search_data=d.search_data,
463
- source_url=url,
595
+ splits = split_doc(Path(os.path.expanduser(d.file_path)))
596
+
597
+ for sd in splits:
598
+ url = self._upload_file(upload_api, sd)
599
+
600
+ ft = d.file_type
601
+ if sd.suffix.lower() in SUFFIX_ALIASES:
602
+ ft = SUFFIX_ALIASES[sd.suffix.lower()]
603
+
604
+ fn = sd.name
605
+ if len(splits) == 1 and d.file_name:
606
+ fn = d.file_name
607
+
608
+ remote_documents.append(
609
+ IngestRemoteDocument(
610
+ bucket_id=d.bucket_id,
611
+ file_name=fn,
612
+ file_type=ft,
613
+ process_level=d.process_level,
614
+ search_data=d.search_data,
615
+ source_url=url,
616
+ )
464
617
  )
465
- )
466
618
 
467
619
  return await self.documents.ingest_remote(
468
- documents=docs,
620
+ documents=remote_documents,
469
621
  request_options=request_options,
470
622
  )
471
623
 
@@ -476,6 +628,8 @@ class AsyncGroundX(AsyncGroundXBase):
476
628
  ):
477
629
  file_name = os.path.basename(file_path)
478
630
  file_extension = os.path.splitext(file_name)[1][1:].lower()
631
+ if f".{file_extension}" in SUFFIX_ALIASES:
632
+ file_extension = SUFFIX_ALIASES[f".{file_extension}"]
479
633
 
480
634
  presigned_info = get_presigned_url(endpoint, file_name, file_extension)
481
635
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes