groundx 2.2.4__tar.gz → 2.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. {groundx-2.2.4 → groundx-2.2.8}/PKG-INFO +2 -1
  2. {groundx-2.2.4 → groundx-2.2.8}/pyproject.toml +2 -1
  3. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/client_wrapper.py +1 -1
  4. groundx-2.2.8/src/groundx/ingest.py +506 -0
  5. groundx-2.2.4/src/groundx/ingest.py +0 -335
  6. {groundx-2.2.4 → groundx-2.2.8}/LICENSE +0 -0
  7. {groundx-2.2.4 → groundx-2.2.8}/README.md +0 -0
  8. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/__init__.py +0 -0
  9. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/buckets/__init__.py +0 -0
  10. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/buckets/client.py +0 -0
  11. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/client.py +0 -0
  12. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/__init__.py +0 -0
  13. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/api_error.py +0 -0
  14. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/datetime_utils.py +0 -0
  15. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/file.py +0 -0
  16. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/http_client.py +0 -0
  17. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/jsonable_encoder.py +0 -0
  18. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/pydantic_utilities.py +0 -0
  19. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/query_encoder.py +0 -0
  20. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/remove_none_from_dict.py +0 -0
  21. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/request_options.py +0 -0
  22. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/core/serialization.py +0 -0
  23. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/customer/__init__.py +0 -0
  24. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/customer/client.py +0 -0
  25. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/documents/__init__.py +0 -0
  26. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/documents/client.py +0 -0
  27. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/environment.py +0 -0
  28. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/errors/__init__.py +0 -0
  29. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/errors/bad_request_error.py +0 -0
  30. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/errors/unauthorized_error.py +0 -0
  31. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/groups/__init__.py +0 -0
  32. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/groups/client.py +0 -0
  33. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/health/__init__.py +0 -0
  34. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/health/client.py +0 -0
  35. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/py.typed +0 -0
  36. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/search/__init__.py +0 -0
  37. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/search/client.py +0 -0
  38. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/search/types/__init__.py +0 -0
  39. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/search/types/search_content_request_id.py +0 -0
  40. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/__init__.py +0 -0
  41. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/bounding_box_detail.py +0 -0
  42. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/bucket_detail.py +0 -0
  43. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/bucket_list_response.py +0 -0
  44. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/bucket_response.py +0 -0
  45. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/bucket_update_detail.py +0 -0
  46. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/bucket_update_response.py +0 -0
  47. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/customer_detail.py +0 -0
  48. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/customer_response.py +0 -0
  49. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/document.py +0 -0
  50. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/document_detail.py +0 -0
  51. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/document_list_response.py +0 -0
  52. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/document_local_ingest_request.py +0 -0
  53. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/document_lookup_response.py +0 -0
  54. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/document_response.py +0 -0
  55. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/document_type.py +0 -0
  56. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/group_detail.py +0 -0
  57. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/group_list_response.py +0 -0
  58. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/group_response.py +0 -0
  59. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/health_response.py +0 -0
  60. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/health_response_health.py +0 -0
  61. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/health_service.py +0 -0
  62. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/health_service_status.py +0 -0
  63. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/ingest_local_document.py +0 -0
  64. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/ingest_local_document_metadata.py +0 -0
  65. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/ingest_remote_document.py +0 -0
  66. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/ingest_response.py +0 -0
  67. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/ingest_response_ingest.py +0 -0
  68. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/message_response.py +0 -0
  69. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/meter_detail.py +0 -0
  70. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/process_level.py +0 -0
  71. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/process_status_response.py +0 -0
  72. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/process_status_response_ingest.py +0 -0
  73. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/process_status_response_ingest_progress.py +0 -0
  74. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/process_status_response_ingest_progress_cancelled.py +0 -0
  75. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/process_status_response_ingest_progress_complete.py +0 -0
  76. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/process_status_response_ingest_progress_errors.py +0 -0
  77. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/process_status_response_ingest_progress_processing.py +0 -0
  78. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/processes_status_response.py +0 -0
  79. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/processing_status.py +0 -0
  80. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/search_response.py +0 -0
  81. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/search_response_search.py +0 -0
  82. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/search_result_item.py +0 -0
  83. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/sort.py +0 -0
  84. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/sort_order.py +0 -0
  85. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/subscription_detail.py +0 -0
  86. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/subscription_detail_meters.py +0 -0
  87. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/types/website_source.py +0 -0
  88. {groundx-2.2.4 → groundx-2.2.8}/src/groundx/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: groundx
3
- Version: 2.2.4
3
+ Version: 2.2.8
4
4
  Summary:
5
5
  License: MIT
6
6
  Requires-Python: >=3.8,<4.0
@@ -26,6 +26,7 @@ Requires-Dist: pydantic (>=1.9.2)
26
26
  Requires-Dist: pydantic-core (>=2.18.2,<3.0.0)
27
27
  Requires-Dist: requests (>=2.4.0)
28
28
  Requires-Dist: tqdm (>=4.60.0)
29
+ Requires-Dist: types-tqdm (>=4.60.0)
29
30
  Requires-Dist: typing_extensions (>=4.0.0)
30
31
  Description-Content-Type: text/markdown
31
32
 
@@ -3,7 +3,7 @@ name = "groundx"
3
3
 
4
4
  [tool.poetry]
5
5
  name = "groundx"
6
- version = "2.2.4"
6
+ version = "2.2.8"
7
7
  description = ""
8
8
  readme = "README.md"
9
9
  authors = []
@@ -42,6 +42,7 @@ pydantic = ">= 1.9.2"
42
42
  pydantic-core = "^2.18.2"
43
43
  requests = ">=2.4.0"
44
44
  tqdm = ">=4.60.0"
45
+ types-tqdm = ">=4.60.0"
45
46
  typing_extensions = ">= 4.0.0"
46
47
 
47
48
  [tool.poetry.dev-dependencies]
@@ -16,7 +16,7 @@ class BaseClientWrapper:
16
16
  headers: typing.Dict[str, str] = {
17
17
  "X-Fern-Language": "Python",
18
18
  "X-Fern-SDK-Name": "groundx",
19
- "X-Fern-SDK-Version": "2.2.4",
19
+ "X-Fern-SDK-Version": "2.2.8",
20
20
  }
21
21
  headers["X-API-Key"] = self.api_key
22
22
  return headers
@@ -0,0 +1,506 @@
1
+ import requests, time, typing, os
2
+ from pathlib import Path
3
+ from tqdm import tqdm
4
+ from urllib.parse import urlparse, urlunparse
5
+
6
+ from .client import GroundXBase, AsyncGroundXBase
7
+ from .core.request_options import RequestOptions
8
+ from .types.document import Document
9
+ from .types.document_type import DocumentType
10
+ from .types.ingest_remote_document import IngestRemoteDocument
11
+ from .types.ingest_response import IngestResponse
12
+
13
# Ellipsis cast to Any; used as the default value for optional parameters.
OMIT = typing.cast(typing.Any, ...)


# Document type (file extension without the leading dot) -> MIME type.
# NOTE(review): IANA registers SVG as "image/svg+xml"; confirm that
# "image/svg" is what the backend expects before relying on it.
DOCUMENT_TYPE_TO_MIME = {
    "bmp": "image/bmp",
    "gif": "image/gif",
    "heif": "image/heif",
    "hwp": "application/x-hwp",
    "ico": "image/vnd.microsoft.icon",
    "svg": "image/svg",
    "tiff": "image/tiff",
    "webp": "image/webp",
    "txt": "text/plain",
    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "pdf": "application/pdf",
    "png": "image/png",
    "jpg": "image/jpeg",
    "csv": "text/csv",
    "tsv": "text/tab-separated-values",
    "json": "application/json",
}

# Inverse table: MIME type -> document type.
MIME_TO_DOCUMENT_TYPE = {
    mime: doc_type for doc_type, mime in DOCUMENT_TYPE_TO_MIME.items()
}

# Same table keyed by dotted file suffix (".pdf", ".png", ...), which is the
# form `Path.suffix` produces.
ALLOWED_SUFFIXES = {
    "." + doc_type: mime for doc_type, mime in DOCUMENT_TYPE_TO_MIME.items()
}

# Alternative spellings of suffixes that map onto an entry of ALLOWED_SUFFIXES.
SUFFIX_ALIASES = {
    ".jpeg": ".jpg",
    ".heic": ".heif",
    ".tif": ".tiff",
}

# Limits on how many documents (and how many bytes) go into one ingest request.
MAX_BATCH_SIZE = 50
MIN_BATCH_SIZE = 1
MAX_BATCH_SIZE_BYTES = 50 * 1024 * 1024  # 50 MiB
50
+
51
def get_presigned_url(
    endpoint: str,
    file_name: str,
    file_extension: str,
    timeout: typing.Optional[float] = 30.0,
) -> typing.Dict[str, typing.Any]:
    """
    Request upload details for a file from the upload endpoint.

    Parameters
    ----------
    endpoint : str
        URL that is queried with GET; receives 'name' and 'type' query params.

    file_name : str
        Sent as the 'name' query parameter.

    file_extension : str
        Sent as the 'type' query parameter.

    timeout : typing.Optional[float]
        Seconds to wait for the endpoint before giving up. Pass None to wait
        indefinitely (the previous behaviour).

    Returns
    -------
    typing.Dict[str, typing.Any]
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status.
    requests.Timeout
        If the endpoint does not answer within `timeout` seconds.
    """
    # Without an explicit timeout `requests` waits forever on a stalled server.
    response = requests.get(
        endpoint,
        params={"name": file_name, "type": file_extension},
        timeout=timeout,
    )
    response.raise_for_status()

    return response.json()
61
+
62
def strip_query_params(
    url: str,
) -> str:
    """
    Return `url` reduced to scheme, host and path.

    The path parameters (the ';...' suffix), query string and fragment are
    dropped.
    """
    bare = urlparse(url)._replace(params="", query="", fragment="")

    return bare.geturl()
69
+
70
def prep_documents(
    documents: typing.Sequence[Document],
) -> typing.Tuple[
    typing.List[IngestRemoteDocument],
    typing.List[Document],
]:
    """
    Sort documents into hosted (remote) and on-disk (local) groups.

    A document whose `file_path` parses as a URL with both a scheme and a
    network location is converted to an `IngestRemoteDocument`. A document
    whose `file_path` (after `~` expansion) exists on disk is returned
    unchanged in the local list.

    Returns
    -------
    (remote_documents, local_documents)

    Raises
    ------
    ValueError
        If `documents` is empty, a document lacks `file_path`, or a
        `file_path` is neither a URL nor an existing local path.
    """
    if not documents:
        raise ValueError("No documents provided for ingestion.")

    def looks_like_url(candidate: str) -> bool:
        try:
            parts = urlparse(candidate)
        except ValueError:
            return False
        return all([parts.scheme, parts.netloc])

    def exists_on_disk(candidate: str) -> bool:
        return os.path.exists(os.path.expanduser(candidate))

    remote: typing.List[IngestRemoteDocument] = []
    local: typing.List[Document] = []

    for entry in documents:
        if not hasattr(entry, "file_path"):
            raise ValueError("Each document must have a 'file_path' attribute.")

        location = entry.file_path

        # The URL check runs first, so a string that is both is treated as remote.
        if looks_like_url(location):
            remote.append(
                IngestRemoteDocument(
                    bucket_id=entry.bucket_id,
                    file_name=entry.file_name,
                    file_type=entry.file_type,
                    process_level=entry.process_level,
                    search_data=entry.search_data,
                    source_url=location,
                )
            )
            continue

        if not exists_on_disk(location):
            raise ValueError(f"Invalid file path: {location}")

        local.append(entry)

    return remote, local
116
+
117
+
118
class GroundX(GroundXBase):
    """Synchronous client with helpers for ingesting documents, local files and directories."""

    def ingest(
        self,
        *,
        documents: typing.Sequence[Document],
        upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
        request_options: typing.Optional[RequestOptions] = None,
    ) -> IngestResponse:
        """
        Ingest local or hosted documents into a GroundX bucket.

        Local files are first uploaded through `upload_api`, then submitted
        as remote documents that point at the uploaded location.

        Parameters
        ----------
        documents : typing.Sequence[Document]
            Documents to ingest; must be all local or all remote.

        upload_api : typing.Optional[str]
            An endpoint that accepts 'name' and 'type' query params
            and returns a presigned URL in a JSON dictionary with key 'URL'.
            Only used for local documents.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        IngestResponse
            Documents successfully uploaded

        Raises
        ------
        ValueError
            If local and remote documents are mixed, or more than
            MAX_BATCH_SIZE documents are supplied.

        Examples
        --------
        from groundx import Document, GroundX

        client = GroundX(
            api_key="YOUR_API_KEY",
        )

        client.ingest(
            documents=[
                Document(
                    bucket_id=1234,
                    file_name="my_file1.txt",
                    file_path="https://my.source.url.com/file1.txt",
                    file_type="txt",
                )
            ],
        )
        """
        remote_documents, local_documents = prep_documents(documents)

        if local_documents and remote_documents:
            raise ValueError("Documents must all be either local or remote, not a mix.")

        if len(remote_documents) > 0:
            if len(remote_documents) > MAX_BATCH_SIZE:
                raise ValueError("You have sent too many documents in this request")

            return self.documents.ingest_remote(
                documents=remote_documents,
                request_options=request_options,
            )

        if len(local_documents) > MAX_BATCH_SIZE:
            raise ValueError("You have sent too many documents in this request")

        if len(local_documents) == 0:
            raise ValueError("No valid documents were provided")

        docs: typing.List[IngestRemoteDocument] = []
        for d in local_documents:
            url = self._upload_file(upload_api, Path(os.path.expanduser(d.file_path)))

            docs.append(
                IngestRemoteDocument(
                    bucket_id=d.bucket_id,
                    file_name=d.file_name,
                    file_type=d.file_type,
                    process_level=d.process_level,
                    search_data=d.search_data,
                    source_url=url,
                )
            )

        return self.documents.ingest_remote(
            documents=docs,
            request_options=request_options,
        )

    def ingest_directory(
        self,
        *,
        bucket_id: int,
        path: str,
        batch_size: typing.Optional[int] = 10,
        upload_api: typing.Optional[str] = "https://api.eyelevel.ai/upload/file",
        request_options: typing.Optional[RequestOptions] = None,
    ) -> None:
        """
        Ingest documents from a local directory into a GroundX bucket.

        The directory is searched recursively for files with a supported
        suffix. Files are uploaded and ingested in batches, and a progress
        bar is shown while each batch is processed.

        Parameters
        ----------
        bucket_id : int
            Target bucket; must be >= 1.

        path : str
            Directory to scan; a leading '~' is expanded.

        batch_size : typing.Optional[int]
            Maximum files per batch; clamped to
            [MIN_BATCH_SIZE, MAX_BATCH_SIZE]. A batch is also closed once it
            would exceed MAX_BATCH_SIZE_BYTES.

        upload_api : typing.Optional[str]
            An endpoint that accepts 'name' and 'type' query params
            and returns a presigned URL in a JSON dictionary with key 'URL'.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If `bucket_id` is invalid, `path` is not a directory, no
            supported files are found, or a batch ends in an error or
            cancelled state.

        Examples
        --------
        from groundx import GroundX

        client = GroundX(
            api_key="YOUR_API_KEY",
        )

        client.ingest_directory(
            bucket_id=1234,
            path="/path/to/directory"
        )
        """
        if bucket_id < 1:
            raise ValueError(f"Invalid bucket_id: {bucket_id}")

        # Expand '~' once and use the expanded path for both validation and
        # scanning; previously only the validation saw the expanded form.
        directory = Path(os.path.expanduser(path))

        if not directory.is_dir():
            raise ValueError(f"Invalid directory path: {path}")

        files: typing.List[Path] = [
            file
            for file in directory.rglob("*")
            if file.is_file()
            and (
                file.suffix.lower() in ALLOWED_SUFFIXES
                or file.suffix.lower() in SUFFIX_ALIASES
            )
        ]

        if len(files) < 1:
            raise ValueError(f"No supported files found in directory: {path}")

        current_batch: typing.List[Path] = []
        current_batch_size: int = 0

        n = max(MIN_BATCH_SIZE, min(batch_size or MIN_BATCH_SIZE, MAX_BATCH_SIZE))

        with tqdm(total=len(files), desc="Ingesting Files", unit="file") as pbar:
            for file in files:
                file_size = file.stat().st_size

                batch_is_full = (
                    current_batch_size + file_size > MAX_BATCH_SIZE_BYTES
                    or len(current_batch) >= n
                )
                # `current_batch` guard: a single file above the byte cap must
                # not trigger a flush of an empty batch.
                if current_batch and batch_is_full:
                    self._upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)
                    current_batch = []
                    current_batch_size = 0

                current_batch.append(file)
                current_batch_size += file_size

            if current_batch:
                self._upload_file_batch(bucket_id, current_batch, upload_api, request_options, pbar)

    def _upload_file(
        self,
        endpoint,
        file_path,
    ):
        """
        Upload one local file via a presigned URL and return its location.

        Asks `endpoint` for upload details (keys 'URL', optional 'Header' and
        'Method'), PUTs the file contents to that URL, and returns the URL
        with its query string removed.

        Raises
        ------
        ValueError
            If the file cannot be read or the endpoint asks for a method
            other than PUT.
        Exception
            If the upload response status is not 200 or 201.
        """
        file_name = os.path.basename(file_path)
        file_extension = os.path.splitext(file_name)[1][1:].lower()

        presigned_info = get_presigned_url(endpoint, file_name, file_extension)

        upload_url = presigned_info["URL"]
        method = presigned_info.get("Method", "PUT").upper()

        # Header values may arrive as lists; only the first entry is sent.
        headers = {
            key: value[0] if isinstance(value, list) else value
            for key, value in presigned_info.get("Header", {}).items()
        }

        try:
            with open(file_path, "rb") as f:
                file_data = f.read()
        except Exception as e:
            raise ValueError(f"Error reading file {file_path}: {e}") from e

        if method != "PUT":
            raise ValueError(f"Unsupported HTTP method: {method}")

        upload_response = requests.put(upload_url, data=file_data, headers=headers)

        if upload_response.status_code not in (200, 201):
            raise Exception(
                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
            )

        return strip_query_params(upload_url)

    def _upload_file_batch(
        self,
        bucket_id,
        batch,
        upload_api,
        request_options,
        pbar,
    ):
        """
        Upload a batch of files, ingest them, and poll until processing ends.

        Progress accounting: each file contributes 1.0 to `pbar` in total --
        0.25 once uploaded and 0.75 once reported complete. Whatever is left
        when polling stops is added at the end.

        Raises
        ------
        ValueError
            If the ingest finishes with status 'error' or 'cancelled'.
        """
        docs = []

        progress = len(batch)
        for file in batch:
            url = self._upload_file(upload_api, file)
            docs.append(
                Document(
                    bucket_id=bucket_id,
                    file_path=url,
                ),
            )
            pbar.update(0.25)
            progress -= 0.25

        if not docs:
            return

        ingest = self.ingest(documents=docs, request_options=request_options)

        # Document ids already credited on the progress bar, so repeated polls
        # do not count the same document again.
        completed_files = set()

        while (
            ingest is not None
            and ingest.ingest.status not in ["complete", "error", "cancelled"]
        ):
            time.sleep(3)
            ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)

            # NOTE(review): only the 'processing' bucket is inspected here;
            # confirm whether finished documents are reported under a
            # different progress bucket.
            if ingest.ingest.progress and ingest.ingest.progress.processing:
                for doc in ingest.ingest.progress.processing.documents or []:
                    if doc.status == "complete" and doc.document_id not in completed_files:
                        completed_files.add(doc.document_id)
                        pbar.update(0.75)
                        progress -= 0.75

        if ingest is not None and ingest.ingest.status in ["error", "cancelled"]:
            raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")

        if progress > 0:
            pbar.update(progress)
378
+
379
+
380
+
381
class AsyncGroundX(AsyncGroundXBase):
    """Asynchronous client with a helper for ingesting hosted documents and local files."""

    async def ingest(
        self,
        *,
        documents: typing.Sequence[Document],
        upload_api: str = "https://api.eyelevel.ai/upload/file",
        request_options: typing.Optional[RequestOptions] = None,
    ) -> IngestResponse:
        """
        Ingest local or hosted documents into a GroundX bucket.

        Local files are first uploaded through `upload_api`, then submitted
        as remote documents that point at the uploaded location.

        Parameters
        ----------
        documents : typing.Sequence[Document]
            Documents to ingest; must be all local or all remote.

        upload_api : str
            An endpoint that accepts 'name' and 'type' query params
            and returns a presigned URL in a JSON dictionary with key 'URL'.
            Only used for local documents.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        IngestResponse
            Documents successfully uploaded

        Raises
        ------
        ValueError
            If local and remote documents are mixed, or more than
            MAX_BATCH_SIZE documents are supplied.

        Examples
        --------
        import asyncio

        from groundx import AsyncGroundX, Document

        client = AsyncGroundX(
            api_key="YOUR_API_KEY",
        )

        async def main() -> None:
            await client.ingest(
                documents=[
                    Document(
                        bucket_id=1234,
                        file_name="my_file1.txt",
                        file_path="https://my.source.url.com/file1.txt",
                        file_type="txt",
                    )
                ],
            )

        asyncio.run(main())
        """
        # Imported here so this block does not depend on the module's import list.
        import asyncio

        remote_documents, local_documents = prep_documents(documents)

        if local_documents and remote_documents:
            raise ValueError("Documents must all be either local or remote, not a mix.")

        if len(remote_documents) > 0:
            if len(remote_documents) > MAX_BATCH_SIZE:
                raise ValueError("You have sent too many documents in this request")

            return await self.documents.ingest_remote(
                documents=remote_documents,
                request_options=request_options,
            )

        if len(local_documents) > MAX_BATCH_SIZE:
            raise ValueError("You have sent too many documents in this request")

        if len(local_documents) == 0:
            raise ValueError("No valid documents were provided")

        # `_upload_file` does blocking file and HTTP I/O; run it on the
        # default executor so the event loop stays responsive. Uploads still
        # happen one at a time, in document order.
        loop = asyncio.get_running_loop()

        docs: typing.List[IngestRemoteDocument] = []
        for d in local_documents:
            url = await loop.run_in_executor(
                None,
                self._upload_file,
                upload_api,
                Path(os.path.expanduser(d.file_path)),
            )

            docs.append(
                IngestRemoteDocument(
                    bucket_id=d.bucket_id,
                    file_name=d.file_name,
                    file_type=d.file_type,
                    process_level=d.process_level,
                    search_data=d.search_data,
                    source_url=url,
                )
            )

        return await self.documents.ingest_remote(
            documents=docs,
            request_options=request_options,
        )

    def _upload_file(
        self,
        endpoint,
        file_path,
    ):
        """
        Upload one local file via a presigned URL and return its location.

        Asks `endpoint` for upload details (keys 'URL', optional 'Header' and
        'Method'), PUTs the file contents to that URL, and returns the URL
        with its query string removed. This method is synchronous and blocks
        until the upload finishes.

        Raises
        ------
        ValueError
            If the file cannot be read or the endpoint asks for a method
            other than PUT.
        Exception
            If the upload response status is not 200 or 201.
        """
        file_name = os.path.basename(file_path)
        file_extension = os.path.splitext(file_name)[1][1:].lower()

        presigned_info = get_presigned_url(endpoint, file_name, file_extension)

        upload_url = presigned_info["URL"]
        method = presigned_info.get("Method", "PUT").upper()

        # Header values may arrive as lists; only the first entry is sent.
        headers = {
            key: value[0] if isinstance(value, list) else value
            for key, value in presigned_info.get("Header", {}).items()
        }

        try:
            with open(file_path, "rb") as f:
                file_data = f.read()
        except Exception as e:
            raise ValueError(f"Error reading file {file_path}: {e}") from e

        if method != "PUT":
            raise ValueError(f"Unsupported HTTP method: {method}")

        upload_response = requests.put(upload_url, data=file_data, headers=headers)

        if upload_response.status_code not in (200, 201):
            raise Exception(
                f"Upload failed: {upload_response.status_code} - {upload_response.text}"
            )

        return strip_query_params(upload_url)
@@ -1,335 +0,0 @@
1
- import aiohttp, io, json, mimetypes, requests, typing, os
2
- from asyncio import TimeoutError
3
- from urllib.parse import urlparse
4
-
5
- from json.decoder import JSONDecodeError
6
-
7
- from .client import GroundXBase, AsyncGroundXBase
8
- from .core.api_error import ApiError
9
- from .core.pydantic_utilities import parse_obj_as
10
- from .core.request_options import RequestOptions
11
- from .errors.bad_request_error import BadRequestError
12
- from .errors.unauthorized_error import UnauthorizedError
13
- from .types.document import Document
14
- from .types.ingest_remote_document import IngestRemoteDocument
15
- from .types.ingest_response import IngestResponse
16
-
17
- # this is used as the default value for optional parameters
18
- OMIT = typing.cast(typing.Any, ...)
19
-
20
-
21
- DOCUMENT_TYPE_TO_MIME = {
22
- "txt": "text/plain",
23
- "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
24
- "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
25
- "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
26
- "pdf": "application/pdf",
27
- "png": "image/png",
28
- "jpg": "image/jpeg",
29
- "csv": "text/csv",
30
- "tsv": "text/tab-separated-values",
31
- "json": "application/json",
32
- }
33
- MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
34
-
35
-
36
- def prep_documents(
37
- documents: typing.Sequence[Document],
38
- ) -> typing.Tuple[
39
- typing.List[IngestRemoteDocument],
40
- typing.List[
41
- typing.Tuple[str, typing.Tuple[typing.Union[str, None], typing.BinaryIO, str]]
42
- ],
43
- ]:
44
- """
45
- Process documents and separate them into remote and local documents.
46
- """
47
- if not documents:
48
- raise ValueError("No documents provided for ingestion.")
49
-
50
- def is_valid_local_path(path: str) -> bool:
51
- expanded_path = os.path.expanduser(path)
52
- return os.path.exists(expanded_path)
53
-
54
- def is_valid_url(path: str) -> bool:
55
- try:
56
- result = urlparse(path)
57
- return all([result.scheme, result.netloc])
58
- except ValueError:
59
- return False
60
-
61
- local_documents: typing.List[
62
- typing.Tuple[str, typing.Tuple[typing.Union[str, None], typing.BinaryIO, str]]
63
- ] = []
64
- remote_documents: typing.List[IngestRemoteDocument] = []
65
-
66
- for document in documents:
67
- if not hasattr(document, "file_path"):
68
- raise ValueError("Each document must have a 'file_path' attribute.")
69
-
70
- if is_valid_url(document.file_path):
71
- remote_document = IngestRemoteDocument(
72
- bucket_id=document.bucket_id,
73
- file_name=document.file_name,
74
- file_type=document.file_type,
75
- process_level=document.process_level,
76
- search_data=document.search_data,
77
- source_url=document.file_path,
78
- )
79
- remote_documents.append(remote_document)
80
- elif is_valid_local_path(document.file_path):
81
- expanded_path = os.path.expanduser(document.file_path)
82
- file_name = os.path.basename(expanded_path)
83
- mime_type = mimetypes.guess_type(file_name)[0] or "application/octet-stream"
84
- file_type = MIME_TO_DOCUMENT_TYPE.get(mime_type, None)
85
- if document.file_type:
86
- file_type = document.file_type
87
- mime_type = DOCUMENT_TYPE_TO_MIME.get(
88
- document.file_type, "application/octet-stream"
89
- )
90
-
91
- if document.file_name:
92
- file_name = document.file_name
93
-
94
- try:
95
- local_documents.append(
96
- (
97
- "blob",
98
- (
99
- file_name,
100
- open(expanded_path, "rb"),
101
- mime_type,
102
- ),
103
- )
104
- )
105
- except Exception as e:
106
- raise ValueError(f"Error reading file {expanded_path}: {e}")
107
-
108
- metadata = {
109
- "bucketId": document.bucket_id,
110
- "fileName": file_name,
111
- "fileType": file_type,
112
- }
113
- if document.process_level:
114
- metadata["processLevel"] = document.process_level
115
- if document.search_data:
116
- metadata["searchData"] = document.search_data
117
-
118
- local_documents.append(
119
- (
120
- "metadata",
121
- (
122
- f"data.json",
123
- io.BytesIO(json.dumps(metadata).encode("utf-8")),
124
- "application/json",
125
- ),
126
- )
127
- )
128
- else:
129
- raise ValueError(f"Invalid file path: {document.file_path}")
130
-
131
- return remote_documents, local_documents
132
-
133
-
134
- class GroundX(GroundXBase):
135
- def ingest(
136
- self,
137
- *,
138
- documents: typing.Sequence[Document],
139
- request_options: typing.Optional[RequestOptions] = None,
140
- ) -> IngestResponse:
141
- """
142
- Ingest local or hosted documents into a GroundX bucket.
143
-
144
- Parameters
145
- ----------
146
- documents : typing.Sequence[Document]
147
-
148
- request_options : typing.Optional[RequestOptions]
149
- Request-specific configuration.
150
-
151
- Returns
152
- -------
153
- IngestResponse
154
- Documents successfully uploaded
155
-
156
- Examples
157
- --------
158
- from groundx import Document, GroundX
159
-
160
- client = GroundX(
161
- api_key="YOUR_API_KEY",
162
- )
163
-
164
- client.ingest(
165
- documents=[
166
- Document(
167
- bucket_id=1234,
168
- file_name="my_file1.txt",
169
- file_path="https://my.source.url.com/file1.txt",
170
- file_type="txt",
171
- )
172
- ],
173
- )
174
- """
175
- remote_documents, local_documents = prep_documents(documents)
176
-
177
- if local_documents and remote_documents:
178
- raise ValueError("Documents must all be either local or remote, not a mix.")
179
-
180
- if len(remote_documents) > 0:
181
- return self.documents.ingest_remote(
182
- documents=remote_documents,
183
- request_options=request_options,
184
- )
185
-
186
- timeout = self._client_wrapper.get_timeout()
187
- headers = self._client_wrapper.get_headers()
188
- base_url = self._client_wrapper.get_base_url().rstrip("/")
189
- follow_redirects = getattr(
190
- self._client_wrapper.httpx_client, "follow_redirects", True
191
- )
192
-
193
- url = f"{base_url}/v1/ingest/documents/local"
194
- _response = requests.post(
195
- url,
196
- files=local_documents,
197
- headers=headers,
198
- timeout=timeout,
199
- allow_redirects=follow_redirects,
200
- )
201
-
202
- try:
203
- if 200 <= _response.status_code < 300:
204
- return typing.cast(
205
- IngestResponse,
206
- parse_obj_as(
207
- type_=IngestResponse, # type: ignore
208
- object_=_response.json(),
209
- ),
210
- )
211
- if _response.status_code == 400:
212
- raise BadRequestError(
213
- typing.cast(
214
- typing.Optional[typing.Any],
215
- parse_obj_as(
216
- type_=typing.Optional[typing.Any], # type: ignore
217
- object_=_response.json(),
218
- ),
219
- )
220
- )
221
- if _response.status_code == 401:
222
- raise UnauthorizedError(
223
- typing.cast(
224
- typing.Optional[typing.Any],
225
- parse_obj_as(
226
- type_=typing.Optional[typing.Any], # type: ignore
227
- object_=_response.json(),
228
- ),
229
- )
230
- )
231
- _response_json = _response.json()
232
- except JSONDecodeError:
233
- raise ApiError(status_code=_response.status_code, body=_response.text)
234
-
235
- raise ApiError(status_code=_response.status_code, body=_response_json)
236
-
237
-
238
- class AsyncGroundX(AsyncGroundXBase):
239
- async def ingest(
240
- self,
241
- *,
242
- documents: typing.Sequence[Document],
243
- request_options: typing.Optional[RequestOptions] = None,
244
- ) -> IngestResponse:
245
- """
246
- Ingest local or hosted documents into a GroundX bucket.
247
-
248
- Parameters
249
- ----------
250
- documents : typing.Sequence[Document]
251
-
252
- request_options : typing.Optional[RequestOptions]
253
- Request-specific configuration.
254
-
255
- Returns
256
- -------
257
- IngestResponse
258
- Documents successfully uploaded
259
-
260
- Examples
261
- --------
262
- import asyncio
263
-
264
- from groundx import AsyncGroundX, Document
265
-
266
- client = AsyncGroundX(
267
- api_key="YOUR_API_KEY",
268
- )
269
-
270
- async def main() -> None:
271
- await client.ingest(
272
- documents=[
273
- Document(
274
- bucket_id=1234,
275
- file_name="my_file1.txt",
276
- file_path="https://my.source.url.com/file1.txt",
277
- file_type="txt",
278
- )
279
- ],
280
- )
281
-
282
- asyncio.run(main())
283
- """
284
- remote_documents, local_documents = prep_documents(documents)
285
-
286
- if local_documents and remote_documents:
287
- raise ValueError("Documents must all be either local or remote, not a mix.")
288
-
289
- if len(remote_documents) > 0:
290
- return await self.documents.ingest_remote(
291
- documents=remote_documents,
292
- request_options=request_options,
293
- )
294
-
295
- timeout = self._client_wrapper.get_timeout()
296
- headers = self._client_wrapper.get_headers()
297
- base_url = self._client_wrapper.get_base_url().rstrip("/")
298
-
299
- url = f"{base_url}/v1/ingest/documents/local"
300
-
301
- try:
302
- async with aiohttp.ClientSession() as session:
303
- data = aiohttp.FormData()
304
- for field_name, (file_name, file_obj, content_type) in local_documents:
305
- data.add_field(
306
- name=field_name,
307
- value=file_obj,
308
- filename=file_name,
309
- content_type=content_type,
310
- )
311
-
312
- async with session.post(
313
- url, data=data, headers=headers, timeout=timeout
314
- ) as response:
315
- if 200 <= response.status < 300:
316
- response_data = await response.json()
317
- return typing.cast(
318
- IngestResponse,
319
- parse_obj_as(
320
- type_=IngestResponse, # type: ignore
321
- object_=response_data,
322
- ),
323
- )
324
- if response.status == 400:
325
- raise BadRequestError(await response.json())
326
- if response.status == 401:
327
- raise UnauthorizedError(await response.json())
328
-
329
- raise ApiError(
330
- status_code=response.status, body=await response.text()
331
- )
332
- except TimeoutError:
333
- raise ApiError(status_code=408, body="Request timed out")
334
- except aiohttp.ClientError as e:
335
- raise ApiError(status_code=500, body=str(e))
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes