groundx 2.2.7__py3-none-any.whl → 2.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

groundx/core/client_wrapper.py CHANGED
@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "groundx",
-            "X-Fern-SDK-Version": "2.2.7",
+            "X-Fern-SDK-Version": "2.2.9",
         }
         headers["X-API-Key"] = self.api_key
         return headers

groundx/csv_splitter.py ADDED
@@ -0,0 +1,64 @@
+import csv, math, os, tempfile, typing
+from pathlib import Path
+
+
+class CSVSplitter:
+    def __init__(self, filepath, delimiter=','):
+        self.filepath = filepath
+        self.delimiter = delimiter
+        self.filename = os.path.basename(filepath)
+        self.file_size = os.path.getsize(filepath)
+        self.rows_count = self.get_row_count()
+
+    def get_row_count(self):
+        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+            return sum(1 for _ in csvfile) - 1
+
+    def determine_splits(self):
+        row_mod = int(self.rows_count / 1000) + 1
+        file_mod = int(self.file_size / 1024 / 1024) + 1
+
+        return max(row_mod, file_mod)
+
+    def split(self):
+        splits = self.determine_splits()
+        if splits < 2:
+            return [Path(self.filepath)]
+
+        rows_per_file = math.ceil(self.rows_count / splits)
+
+        split_files: typing.List[Path] = []
+        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+            reader = csv.reader(csvfile, delimiter=self.delimiter)
+            headers = next(reader)
+
+            temp_dir = tempfile.mkdtemp()
+
+            current_file_number = 1
+            current_row = 0
+            current_writer = None
+            current_output_file = None
+
+            for row in reader:
+                if current_row % rows_per_file == 0:
+                    if current_output_file:
+                        current_output_file.close()
+                    output_file_path = os.path.join(
+                        temp_dir,
+                        f"{os.path.splitext(self.filename)[0]}_{current_file_number}.csv",
+                    )
+                    split_files.append(Path(output_file_path))
+                    current_output_file = open(
+                        output_file_path, "w", newline="", encoding="utf-8"
+                    )
+                    current_writer = csv.writer(current_output_file, delimiter=self.delimiter)
+                    current_writer.writerow(headers)
+                    current_file_number += 1
+
+                current_writer.writerow(row)
+                current_row += 1
+
+            if current_output_file:
+                current_output_file.close()
+
+        return split_files
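
The new CSVSplitter targets chunks of roughly 1,000 rows or 1 MB, whichever implies more parts (determine_splits), and repeats the header row in every part; files under both thresholds come back as a single Path. A minimal usage sketch (the large.csv path is hypothetical):

    from groundx.csv_splitter import CSVSplitter

    # a 2,500-row file under 1 MB implies 3 splits: int(2500 / 1000) + 1
    parts = CSVSplitter(filepath="large.csv").split()
    # -> [.../large_1.csv, .../large_2.csv, .../large_3.csv] in a temp directory
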
groundx/ingest.py CHANGED
@@ -1,19 +1,12 @@
-import aiohttp, io, json, mimetypes, requests, time, typing, os
-from asyncio import TimeoutError
+import requests, time, typing, os
 from pathlib import Path
 from tqdm import tqdm
 from urllib.parse import urlparse, urlunparse
 
-from json.decoder import JSONDecodeError
-
 from .client import GroundXBase, AsyncGroundXBase
-from .core.api_error import ApiError
-from .core.pydantic_utilities import parse_obj_as
 from .core.request_options import RequestOptions
-from .errors.bad_request_error import BadRequestError
-from .errors.unauthorized_error import UnauthorizedError
+from .csv_splitter import CSVSplitter
 from .types.document import Document
-from .types.document_type import DocumentType
 from .types.ingest_remote_document import IngestRemoteDocument
 from .types.ingest_response import IngestResponse
 
@@ -45,23 +38,48 @@ MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
 
 ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
 
+CSV_SPLITS = {
+    ".csv": True,
+}
+TSV_SPLITS = {
+    ".tsv": True,
+}
+
 SUFFIX_ALIASES = {
-    ".jpeg": ".jpg",
-    ".heic": ".heif",
-    ".tif": ".tiff",
+    ".jpeg": "jpg",
+    ".heic": "heif",
+    ".tif": "tiff",
+    ".md": "txt",
 }
 
 MAX_BATCH_SIZE = 50
 MIN_BATCH_SIZE = 1
 MAX_BATCH_SIZE_BYTES = 50 * 1024 * 1024
 
+def get_presigned_url(
+    endpoint: str,
+    file_name: str,
+    file_extension: str,
+) -> typing.Dict[str, typing.Any]:
+    params = {"name": file_name, "type": file_extension}
+    response = requests.get(endpoint, params=params)
+    response.raise_for_status()
+
+    return response.json()
+
+def strip_query_params(
+    url: str,
+) -> str:
+    parsed = urlparse(url)
+    clean_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
+
+    return clean_url
+
 def prep_documents(
     documents: typing.Sequence[Document],
 ) -> typing.Tuple[
     typing.List[IngestRemoteDocument],
-    typing.List[
-        typing.Tuple[str, typing.Tuple[typing.Union[str, None], typing.BinaryIO, str]]
-    ],
+    typing.List[Document],
 ]:
     """
     Process documents and separate them into remote and local documents.
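
get_presigned_url and strip_query_params are now module-level helpers (they were previously closures inside ingest_directory, removed further down in this diff). strip_query_params keeps only scheme, host, and path, which is how the SDK turns a signed upload URL back into a stable source URL; a quick sketch of its behavior (URL hypothetical):

    # drops the signature query string, keeping scheme://host/path
    strip_query_params("https://cdn.example.com/u/report_1.csv?X-Sig=abc123")
    # -> "https://cdn.example.com/u/report_1.csv"
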
@@ -80,9 +98,7 @@ def prep_documents(
         except ValueError:
             return False
 
-    local_documents: typing.List[
-        typing.Tuple[str, typing.Tuple[typing.Union[str, None], typing.BinaryIO, str]]
-    ] = []
+    local_documents: typing.List[Document] = []
     remote_documents: typing.List[IngestRemoteDocument] = []
 
     for document in documents:
@@ -100,64 +116,31 @@
             )
             remote_documents.append(remote_document)
         elif is_valid_local_path(document.file_path):
-            expanded_path = os.path.expanduser(document.file_path)
-            file_name = os.path.basename(expanded_path)
-            mime_type = mimetypes.guess_type(file_name)[0] or "application/octet-stream"
-            file_type = MIME_TO_DOCUMENT_TYPE.get(mime_type, None)
-            if document.file_type:
-                file_type = document.file_type
-                mime_type = DOCUMENT_TYPE_TO_MIME.get(
-                    document.file_type, "application/octet-stream"
-                )
-
-            if document.file_name:
-                file_name = document.file_name
-
-            try:
-                local_documents.append(
-                    (
-                        "blob",
-                        (
-                            file_name,
-                            open(expanded_path, "rb"),
-                            mime_type,
-                        ),
-                    )
-                )
-            except Exception as e:
-                raise ValueError(f"Error reading file {expanded_path}: {e}")
-
-            metadata = {
-                "bucketId": document.bucket_id,
-                "fileName": file_name,
-                "fileType": file_type,
-            }
-            if document.process_level:
-                metadata["processLevel"] = document.process_level
-            if document.search_data:
-                metadata["searchData"] = document.search_data
-
-            local_documents.append(
-                (
-                    "metadata",
-                    (
-                        f"data.json",
-                        io.BytesIO(json.dumps(metadata).encode("utf-8")),
-                        "application/json",
-                    ),
-                )
-            )
+            local_documents.append(document)
         else:
             raise ValueError(f"Invalid file path: {document.file_path}")
 
     return remote_documents, local_documents
 
 
+def split_doc(file):
+    if file.is_file() and (
+        file.suffix.lower() in ALLOWED_SUFFIXES
+        or file.suffix.lower() in SUFFIX_ALIASES
+    ):
+        if file.suffix.lower() in CSV_SPLITS:
+            return CSVSplitter(filepath=file).split()
+        elif file.suffix.lower() in TSV_SPLITS:
+            return CSVSplitter(filepath=file, delimiter='\t').split()
+        return [file]
+    return []
+
 class GroundX(GroundXBase):
     def ingest(
         self,
         *,
         documents: typing.Sequence[Document],
+        upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
         request_options: typing.Optional[RequestOptions] = None,
     ) -> IngestResponse:
         """
@@ -167,6 +150,10 @@ class GroundX(GroundXBase):
         ----------
         documents : typing.Sequence[Document]
 
+        # an endpoint that accepts 'name' and 'type' query params
+        # and returns a presigned URL in a JSON dictionary with key 'URL'
+        upload_api : typing.Optional[str]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
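
The docstring comment is the whole contract: the endpoint receives name and type query parameters and answers with JSON. Reading ahead to _upload_file later in this diff, the reply may also carry optional Header and Method keys, so a conforming response could look like this sketch (values hypothetical):

    presigned_info = {
        "URL": "https://cdn.example.com/u/report.csv?X-Sig=abc123",  # required
        "Header": {"Content-Type": ["text/csv"]},  # optional; list values collapse to their first element
        "Method": "PUT",  # optional; defaults to PUT, the only supported method
    }
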
@@ -196,65 +183,41 @@
         """
         remote_documents, local_documents = prep_documents(documents)
 
-        if local_documents and remote_documents:
-            raise ValueError("Documents must all be either local or remote, not a mix.")
+        if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
+            raise ValueError("You have sent too many documents in this request")
 
-        if len(remote_documents) > 0:
-            return self.documents.ingest_remote(
-                documents=remote_documents,
-                request_options=request_options,
-            )
+        if len(remote_documents) + len(local_documents) == 0:
+            raise ValueError("No valid documents were provided")
 
-        timeout = self._client_wrapper.get_timeout()
-        headers = self._client_wrapper.get_headers()
-        base_url = self._client_wrapper.get_base_url().rstrip("/")
-        follow_redirects = getattr(
-            self._client_wrapper.httpx_client, "follow_redirects", True
-        )
+        for d in local_documents:
+            splits = split_doc(Path(os.path.expanduser(d.file_path)))
 
-        url = f"{base_url}/v1/ingest/documents/local"
-        _response = requests.post(
-            url,
-            files=local_documents,
-            headers=headers,
-            timeout=timeout,
-            allow_redirects=follow_redirects,
-        )
+            for sd in splits:
+                url = self._upload_file(upload_api, sd)
 
-        try:
-            if 200 <= _response.status_code < 300:
-                return typing.cast(
-                    IngestResponse,
-                    parse_obj_as(
-                        type_=IngestResponse,  # type: ignore
-                        object_=_response.json(),
-                    ),
-                )
-            if _response.status_code == 400:
-                raise BadRequestError(
-                    typing.cast(
-                        typing.Optional[typing.Any],
-                        parse_obj_as(
-                            type_=typing.Optional[typing.Any],  # type: ignore
-                            object_=_response.json(),
-                        ),
-                    )
-                )
-            if _response.status_code == 401:
-                raise UnauthorizedError(
-                    typing.cast(
-                        typing.Optional[typing.Any],
-                        parse_obj_as(
-                            type_=typing.Optional[typing.Any],  # type: ignore
-                            object_=_response.json(),
-                        ),
+                ft = d.file_type
+                if sd.suffix.lower() in SUFFIX_ALIASES:
+                    ft = SUFFIX_ALIASES[sd.suffix.lower()]
+
+                fn = sd.name
+                if len(splits) == 1 and d.file_name:
+                    fn = d.file_name
+
+                remote_documents.append(
+                    IngestRemoteDocument(
+                        bucket_id=d.bucket_id,
+                        file_name=fn,
+                        file_type=ft,
+                        process_level=d.process_level,
+                        search_data=d.search_data,
+                        source_url=url,
                     )
                 )
-            _response_json = _response.json()
-        except JSONDecodeError:
-            raise ApiError(status_code=_response.status_code, body=_response.text)
 
-        raise ApiError(status_code=_response.status_code, body=_response_json)
+        return self.documents.ingest_remote(
+            documents=remote_documents,
+            request_options=request_options,
+        )
 
     def ingest_directory(
         self,
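
Local files are now pushed to a presigned URL and re-submitted as remote documents, so the old local-vs-remote split (and its "not a mix" error) is gone; one batch of up to MAX_BATCH_SIZE documents may combine both kinds. A usage sketch, assuming the package's usual top-level exports (bucket ID and paths hypothetical):

    from groundx import Document, GroundX

    client = GroundX(api_key="YOUR_API_KEY")

    # a local CSV (split client-side if large) and a remote PDF in one call
    client.ingest(
        documents=[
            Document(bucket_id=1234, file_path="~/reports/q3.csv"),
            Document(bucket_id=1234, file_path="https://example.com/whitepaper.pdf"),
        ]
    )
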
@@ -275,7 +238,7 @@ class GroundX(GroundXBase):
         batch_size : type.Optional[int]
 
         # an endpoint that accepts 'name' and 'type' query params
-        # and returns a presigned URL
+        # and returns a presigned URL in a JSON dictionary with key 'URL'
         upload_api : typing.Optional[str]
 
         request_options : typing.Optional[RequestOptions]
@@ -300,13 +263,6 @@ class GroundX(GroundXBase):
         )
         """
 
-        def get_presigned_url(endpoint, file_name, file_extension) -> typing.Dict[str, typing.Any]:
-            params = {"name": file_name, "type": file_extension}
-            response = requests.get(endpoint, params=params)
-            response.raise_for_status()
-
-            return response.json()
-
         def is_valid_local_directory(path: str) -> bool:
             expanded_path = os.path.expanduser(path)
             return os.path.isdir(expanded_path)
@@ -314,89 +270,12 @@ class GroundX(GroundXBase):
         def load_directory_files(directory: str) -> typing.List[Path]:
             dir_path = Path(directory)
 
-            matched_files = [
-                file
-                for file in dir_path.rglob("*")
-                if file.is_file() and (
-                    file.suffix.lower() in ALLOWED_SUFFIXES
-                    or file.suffix.lower() in SUFFIX_ALIASES
-                )
-            ]
+            matched_files: typing.List[Path] = []
+            for file in dir_path.rglob("*"):
+                for sd in split_doc(file):
+                    matched_files.append(sd)
 
-            return matched_files
-
-        def strip_query_params(url: str) -> str:
-            parsed = urlparse(url)
-            clean_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
-            return clean_url
-
-        def _upload_file_batch(bucket_id, batch, upload_api, request_options, pbar):
-            docs = []
-
-            progress = len(batch)
-            for file in batch:
-                url = upload_file(upload_api, file)
-                docs.append(
-                    Document(
-                        bucket_id=bucket_id,
-                        file_path=url,
-                    ),
-                )
-                pbar.update(0.25)
-                progress -= 0.25
-
-            if docs:
-                ingest = self.ingest(documents=docs, request_options=request_options)
-
-                completed_files = set()
-
-                while (
-                    ingest is not None
-                    and ingest.ingest.status not in ["complete", "error", "cancelled"]
-                ):
-                    time.sleep(3)
-                    ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
-
-                    if ingest.ingest.progress and ingest.ingest.progress.processing:
-                        for doc in ingest.ingest.progress.processing.documents:
-                            if doc.status == "complete" and doc.document_id not in completed_files:
-                                pbar.update(0.75)
-                                progress -= 0.75
-
-                if ingest.ingest.status in ["error", "cancelled"]:
-                    raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
-
-                if progress > 0:
-                    pbar.update(progress)
-
-        def upload_file(endpoint, file_path) -> str:
-            file_name = os.path.basename(file_path)
-            file_extension = os.path.splitext(file_name)[1][1:].lower()
-
-            presigned_info = get_presigned_url(endpoint, file_name, file_extension)
-
-            upload_url = presigned_info["URL"]
-            headers = presigned_info.get("Header", {})
-            method = presigned_info.get("Method", "PUT").upper()
-
-            for key, value in headers.items():
-                if isinstance(value, list):
-                    headers[key] = value[0]
-
-            with open(file_path, "rb") as f:
-                file_data = f.read()
-
-            if method == "PUT":
-                upload_response = requests.put(upload_url, data=file_data, headers=headers)
-            else:
-                raise ValueError(f"Unsupported HTTP method: {method}")
-
-            if upload_response.status_code not in (200, 201):
-                raise Exception(
-                    f"Upload failed: {upload_response.status_code} - {upload_response.text}"
-                )
-
-            return strip_query_params(upload_url)
+            return matched_files
 
         if bucket_id < 1:
             raise ValueError(f"Invalid bucket_id: {bucket_id}")
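
Since load_directory_files now routes every path through split_doc, a directory scan both filters to supported suffixes and expands oversized CSV/TSV files into their split parts before batching. Illustrative values, assuming a hypothetical directory (rglob order is not guaranteed):

    # ~/data holds big.csv (2,500 rows) and notes.pdf; inside
    # ingest_directory, load_directory_files("~/data") would yield
    # [.../big_1.csv, .../big_2.csv, .../big_3.csv, ~/data/notes.pdf]
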
@@ -419,7 +298,7 @@ class GroundX(GroundXBase):
             file_size = file.stat().st_size
 
             if (current_batch_size + file_size > MAX_BATCH_SIZE_BYTES) or (len(current_batch) >= n):
-                _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+                self._upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
                 current_batch = []
                 current_batch_size = 0
 
@@ -427,7 +306,119 @@ class GroundX(GroundXBase):
                 current_batch_size += file_size
 
         if current_batch:
-            _upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+            self._upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
+
+    def _upload_file(
+        self,
+        endpoint,
+        file_path,
+    ):
+        file_name = os.path.basename(file_path)
+        file_extension = os.path.splitext(file_name)[1][1:].lower()
+        if f".{file_extension}" in SUFFIX_ALIASES:
+            file_extension = SUFFIX_ALIASES[f".{file_extension}"]
+
+        presigned_info = get_presigned_url(endpoint, file_name, file_extension)
+
+        upload_url = presigned_info["URL"]
+        headers = presigned_info.get("Header", {})
+        method = presigned_info.get("Method", "PUT").upper()
+
+        for key, value in headers.items():
+            if isinstance(value, list):
+                headers[key] = value[0]
+
+        try:
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+        except Exception as e:
+            raise ValueError(f"Error reading file {file_path}: {e}")
+
+        if method == "PUT":
+            upload_response = requests.put(upload_url, data=file_data, headers=headers)
+        else:
+            raise ValueError(f"Unsupported HTTP method: {method}")
+
+        if upload_response.status_code not in (200, 201):
+            raise Exception(
+                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
+            )
+
+        return strip_query_params(upload_url)
+
+    def _upload_file_batch(
+        self,
+        bucket_id,
+        batch,
+        upload_api,
+        request_options,
+        pbar,
+    ):
+        docs = []
+
+        progress = len(batch)
+        for file in batch:
+            url = self._upload_file(upload_api, file)
+            if file.suffix.lower() in SUFFIX_ALIASES:
+                docs.append(
+                    Document(
+                        bucket_id=bucket_id,
+                        file_name=file.name,
+                        file_path=url,
+                        file_type=SUFFIX_ALIASES[file.suffix.lower()],
+                    ),
+                )
+            else:
+                docs.append(
+                    Document(
+                        bucket_id=bucket_id,
+                        file_name=file.name,
+                        file_path=url,
+                    ),
+                )
+            pbar.update(0.25)
+            progress -= 0.25
+
+        if docs:
+            ingest = self.ingest(documents=docs, request_options=request_options)
+
+            completed_files = set()
+
+            while (
+                ingest is not None
+                and ingest.ingest.status not in ["complete", "error", "cancelled"]
+            ):
+                time.sleep(3)
+                ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
+
+                if ingest.ingest.progress:
+                    if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
+                        for doc in ingest.ingest.progress.processing.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
+                        for doc in ingest.ingest.progress.complete.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
+                        for doc in ingest.ingest.progress.cancelled.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+                    if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
+                        for doc in ingest.ingest.progress.errors.documents:
+                            if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                pbar.update(0.75)
+                                progress -= 0.75
+
+
+            if ingest.ingest.status in ["error", "cancelled"]:
+                raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
+
+            if progress > 0:
+                pbar.update(progress)
 
 
 
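
Both _upload_file implementations assume the upload_api endpoint described in the docstrings. For local testing, a stand-in that satisfies that contract might look like the following sketch; Flask, boto3, and the bucket name are hypothetical choices, not part of this package:

    import boto3
    from flask import Flask, jsonify, request

    app = Flask(__name__)
    s3 = boto3.client("s3")

    @app.route("/upload/file")  # the SDK issues GET with 'name' and 'type' query params
    def presign():
        name = request.args["name"]  # e.g. "report_1.csv"; 'type' is also sent but unused here
        url = s3.generate_presigned_url(
            "put_object",
            Params={"Bucket": "my-ingest-bucket", "Key": f"uploads/{name}"},
            ExpiresIn=3600,
        )
        # "URL" is required by the SDK; "Header" and "Method" are optional
        return jsonify({"URL": url, "Method": "PUT"})
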
436
427
  self,
437
428
  *,
438
429
  documents: typing.Sequence[Document],
430
+ upload_api: str = "https://api.eyelevel.ai/upload/file",
439
431
  request_options: typing.Optional[RequestOptions] = None,
440
432
  ) -> IngestResponse:
441
433
  """
@@ -445,6 +437,10 @@ class AsyncGroundX(AsyncGroundXBase):
         ----------
         documents : typing.Sequence[Document]
 
+        # an endpoint that accepts 'name' and 'type' query params
+        # and returns a presigned URL in a JSON dictionary with key 'URL'
+        upload_api : typing.Optional[str]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -479,53 +475,76 @@ class AsyncGroundX(AsyncGroundXBase):
         """
         remote_documents, local_documents = prep_documents(documents)
 
-        if local_documents and remote_documents:
-            raise ValueError("Documents must all be either local or remote, not a mix.")
+        if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
+            raise ValueError("You have sent too many documents in this request")
 
-        if len(remote_documents) > 0:
-            return await self.documents.ingest_remote(
-                documents=remote_documents,
-                request_options=request_options,
-            )
+        if len(remote_documents) + len(local_documents) == 0:
+            raise ValueError("No valid documents were provided")
 
-        timeout = self._client_wrapper.get_timeout()
-        headers = self._client_wrapper.get_headers()
-        base_url = self._client_wrapper.get_base_url().rstrip("/")
+        for d in local_documents:
+            splits = split_doc(Path(os.path.expanduser(d.file_path)))
 
-        url = f"{base_url}/v1/ingest/documents/local"
+            for sd in splits:
+                url = self._upload_file(upload_api, sd)
 
-        try:
-            async with aiohttp.ClientSession() as session:
-                data = aiohttp.FormData()
-                for field_name, (file_name, file_obj, content_type) in local_documents:
-                    data.add_field(
-                        name=field_name,
-                        value=file_obj,
-                        filename=file_name,
-                        content_type=content_type,
-                    )
+                ft = d.file_type
+                if sd.suffix.lower() in SUFFIX_ALIASES:
+                    ft = SUFFIX_ALIASES[sd.suffix.lower()]
 
-                async with session.post(
-                    url, data=data, headers=headers, timeout=timeout
-                ) as response:
-                    if 200 <= response.status < 300:
-                        response_data = await response.json()
-                        return typing.cast(
-                            IngestResponse,
-                            parse_obj_as(
-                                type_=IngestResponse,  # type: ignore
-                                object_=response_data,
-                            ),
-                        )
-                    if response.status == 400:
-                        raise BadRequestError(await response.json())
-                    if response.status == 401:
-                        raise UnauthorizedError(await response.json())
-
-                    raise ApiError(
-                        status_code=response.status, body=await response.text()
+                fn = sd.name
+                if len(splits) == 1 and d.file_name:
+                    fn = d.file_name
+
+                remote_documents.append(
+                    IngestRemoteDocument(
+                        bucket_id=d.bucket_id,
+                        file_name=fn,
+                        file_type=ft,
+                        process_level=d.process_level,
+                        search_data=d.search_data,
+                        source_url=url,
                     )
-        except TimeoutError:
-            raise ApiError(status_code=408, body="Request timed out")
-        except aiohttp.ClientError as e:
-            raise ApiError(status_code=500, body=str(e))
+                )
+
+        return await self.documents.ingest_remote(
+            documents=remote_documents,
+            request_options=request_options,
+        )
+
+    def _upload_file(
+        self,
+        endpoint,
+        file_path,
+    ):
+        file_name = os.path.basename(file_path)
+        file_extension = os.path.splitext(file_name)[1][1:].lower()
+        if f".{file_extension}" in SUFFIX_ALIASES:
+            file_extension = SUFFIX_ALIASES[f".{file_extension}"]
+
+        presigned_info = get_presigned_url(endpoint, file_name, file_extension)
+
+        upload_url = presigned_info["URL"]
+        headers = presigned_info.get("Header", {})
+        method = presigned_info.get("Method", "PUT").upper()
+
+        for key, value in headers.items():
+            if isinstance(value, list):
+                headers[key] = value[0]
+
+        try:
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+        except Exception as e:
+            raise ValueError(f"Error reading file {file_path}: {e}")
+
+        if method == "PUT":
+            upload_response = requests.put(upload_url, data=file_data, headers=headers)
+        else:
+            raise ValueError(f"Unsupported HTTP method: {method}")
+
+        if upload_response.status_code not in (200, 201):
+            raise Exception(
+                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
+            )
+
+        return strip_query_params(upload_url)
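
AsyncGroundX.ingest follows the same upload-then-ingest_remote flow; note that its _upload_file still uses requests, so the upload itself blocks the event loop. A usage sketch, assuming the package's usual top-level exports (bucket ID and path hypothetical):

    import asyncio

    from groundx import AsyncGroundX, Document

    async def main():
        client = AsyncGroundX(api_key="YOUR_API_KEY")
        # a .md file is uploaded and ingested as txt via SUFFIX_ALIASES
        await client.ingest(
            documents=[Document(bucket_id=1234, file_path="~/notes/overview.md")]
        )

    asyncio.run(main())
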

groundx-2.2.7.dist-info/METADATA → groundx-2.2.9.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: groundx
-Version: 2.2.7
+Version: 2.2.9
 Summary:
 License: MIT
 Requires-Python: >=3.8,<4.0

groundx-2.2.7.dist-info/RECORD → groundx-2.2.9.dist-info/RECORD
@@ -4,7 +4,7 @@ groundx/buckets/client.py,sha256=4jlc9vfIult1mMJ4FZW4_KFJybZPStZt1FUplIgrxbU,239
 groundx/client.py,sha256=dIW9OyrMyfC1N7HSxRrHh0w_8rJ8osNUOPdYD6ueQ6g,6515
 groundx/core/__init__.py,sha256=SQ85PF84B9MuKnBwHNHWemSGuy-g_515gFYNFhvEE0I,1438
 groundx/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
-groundx/core/client_wrapper.py,sha256=Bhc6L2UfeJoET17u-IIW6OWHD5GwdYaita2HNWDJjr4,1802
+groundx/core/client_wrapper.py,sha256=D6uZpUYxYzmgxNNCTN7quiFvNlBQmPOyLstnrXfcJcs,1802
 groundx/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
 groundx/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
 groundx/core/http_client.py,sha256=Z77OIxIbL4OAB2IDqjRq_sYa5yNYAWfmdhdCSSvh6Y4,19552
@@ -14,6 +14,7 @@ groundx/core/query_encoder.py,sha256=ekulqNd0j8TgD7ox-Qbz7liqX8-KP9blvT9DsRCenYM
 groundx/core/remove_none_from_dict.py,sha256=EU9SGgYidWq7SexuJbNs4-PZ-5Bl3Vppd864mS6vQZw,342
 groundx/core/request_options.py,sha256=h0QUNCFVdCW_7GclVySCAY2w4NhtXVBUCmHgmzaxpcg,1681
 groundx/core/serialization.py,sha256=D9h_t-RQON3-CHWs1C4ESY9B-Yd5d-l5lnTLb_X896g,9601
+groundx/csv_splitter.py,sha256=6HGXdDpwBX_IJaCbla1WuirJERBTvjLzBf9OBtwGFWU,2254
 groundx/customer/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/customer/client.py,sha256=C_JANeDewRD1Kg-q7LPxdiOSWbYSTOiYlBYZLRYPI44,3467
 groundx/documents/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
@@ -26,7 +27,7 @@ groundx/groups/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/groups/client.py,sha256=bytQRh9m7e4vIuYHb7dD1kCTQZvyBxedCqGnmmLqrsI,35237
 groundx/health/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
 groundx/health/client.py,sha256=fcTa21RWPyBuT77PQ0EncC6rBaW_DrYlRvudy9-0H58,7545
-groundx/ingest.py,sha256=RTgmeg_4cEaZynSEyf-3ArKGBcnhbcZhJl7BAeUeAMU,18187
+groundx/ingest.py,sha256=LtnUGcgtE1MNYL3PGFrzPqRMnLeOxr-fVsZ3fmTAUKI,18294
 groundx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 groundx/search/__init__.py,sha256=RagVzjShP33mDg9o4N3kGzV0egL1RYNjCpXPE8VzMYE,145
 groundx/search/client.py,sha256=zrrqFy0HowDUYPsMU4nfvDV2RgmkEQ4E8WYNktu3xcs,18684
@@ -81,7 +82,7 @@ groundx/types/subscription_detail.py,sha256=WNfUw2EMVECIvNYcV2s51zZ6T3Utc4zYXw63
 groundx/types/subscription_detail_meters.py,sha256=lBa8-1QlMVHjr5RLGqhiTKnD1KMM0AAHTWvz9TVtG8w,830
 groundx/types/website_source.py,sha256=3WeRCiilNKKBTfhwgjo3jbcVI3vLTeM-KxI6dVzpg9o,1578
 groundx/version.py,sha256=1yVogKaq260fQfckM2RYN2144SEw0QROsZW8ICtkG4U,74
-groundx-2.2.7.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
-groundx-2.2.7.dist-info/METADATA,sha256=Gx6smhve9G7ECJbxQBrw3pJ_LosNgtehM3VETuW0c9I,5173
-groundx-2.2.7.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
-groundx-2.2.7.dist-info/RECORD,,
+groundx-2.2.9.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
+groundx-2.2.9.dist-info/METADATA,sha256=1BWmC2-Lx8AT1vEY0juSK4ugjxyn9W5ndJy0bomFEwQ,5173
+groundx-2.2.9.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+groundx-2.2.9.dist-info/RECORD,,