groundx 2.0.20-py3-none-any.whl → 2.0.29-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
groundx/ingest.py ADDED
@@ -0,0 +1,334 @@
+ import aiohttp, io, json, mimetypes, requests, typing, os
+ from asyncio import TimeoutError
+ from urllib.parse import urlparse
+
+ from json.decoder import JSONDecodeError
+
+ from .client import GroundXBase, AsyncGroundXBase
+ from .core.api_error import ApiError
+ from .core.pydantic_utilities import parse_obj_as
+ from .core.request_options import RequestOptions
+ from .errors.bad_request_error import BadRequestError
+ from .errors.unauthorized_error import UnauthorizedError
+ from .types.document import Document
+ from .types.ingest_remote_document import IngestRemoteDocument
+ from .types.ingest_response import IngestResponse
+
+ # this is used as the default value for optional parameters
+ OMIT = typing.cast(typing.Any, ...)
+
+
+ DOCUMENT_TYPE_TO_MIME = {
+     "txt": "text/plain",
+     "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+     "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+     "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+     "pdf": "application/pdf",
+     "png": "image/png",
+     "jpg": "image/jpeg",
+     "csv": "text/csv",
+     "tsv": "text/tab-separated-values",
+     "json": "application/json",
+ }
+ MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
+
+
+ def prep_documents(
+     documents: typing.Sequence[Document],
+ ) -> typing.Tuple[
+     typing.List[IngestRemoteDocument],
+     typing.List[
+         typing.Tuple[str, typing.Tuple[typing.Union[str, None], typing.BinaryIO, str]]
+     ],
+ ]:
+     """
+     Process documents and separate them into remote and local documents.
+     """
+     if not documents:
+         raise ValueError("No documents provided for ingestion.")
+
+     def is_valid_local_path(path: str) -> bool:
+         expanded_path = os.path.expanduser(path)
+         return os.path.exists(expanded_path)
+
+     def is_valid_url(path: str) -> bool:
+         try:
+             result = urlparse(path)
+             return all([result.scheme, result.netloc])
+         except ValueError:
+             return False
+
+     idx = 0
+     remote_documents: typing.List[IngestRemoteDocument] = []
+     local_documents: typing.List[
+         typing.Tuple[str, typing.Tuple[typing.Union[str, None], typing.BinaryIO, str]]
+     ] = []
+
+     for document in documents:
+         if not hasattr(document, "file_path"):
+             raise ValueError("Each document must have a 'file_path' attribute.")
+
+         if is_valid_url(document.file_path):
+             remote_document = IngestRemoteDocument(
+                 bucket_id=document.bucket_id,
+                 file_name=document.file_name,
+                 file_type=document.file_type,
+                 search_data=document.search_data,
+                 source_url=document.file_path,
+             )
+             remote_documents.append(remote_document)
+         elif is_valid_local_path(document.file_path):
+             expanded_path = os.path.expanduser(document.file_path)
+             file_name = os.path.basename(expanded_path)
+             mime_type = mimetypes.guess_type(file_name)[0] or "application/octet-stream"
+             file_type = MIME_TO_DOCUMENT_TYPE.get(mime_type, None)
+             if document.file_type:
+                 file_type = document.file_type
+                 mime_type = DOCUMENT_TYPE_TO_MIME.get(
+                     document.file_type, "application/octet-stream"
+                 )
+
+             if document.file_name:
+                 file_name = document.file_name
+
+             try:
+                 local_documents.append(
+                     (
+                         "blob",
+                         (
+                             file_name,
+                             open(expanded_path, "rb"),
+                             mime_type,
+                         ),
+                     )
+                 )
+             except Exception as e:
+                 raise ValueError(f"Error reading file {expanded_path}: {e}")
+
+             metadata = {
+                 "bucketId": document.bucket_id,
+                 "fileName": file_name,
+                 "fileType": file_type,
+             }
+             if document.search_data:
+                 metadata["searchData"] = document.search_data
+
+             local_documents.append(
+                 (
+                     "metadata",
+                     (
+                         f"data.json",
+                         io.BytesIO(json.dumps(metadata).encode("utf-8")),
+                         "application/json",
+                     ),
+                 )
+             )
+             idx += 1
+         else:
+             raise ValueError(f"Invalid file path: {document.file_path}")
+
+     return remote_documents, local_documents
+
+
+ class GroundX(GroundXBase):
+     def ingest(
+         self,
+         *,
+         documents: typing.Sequence[Document],
+         request_options: typing.Optional[RequestOptions] = None,
+     ) -> IngestResponse:
+         """
+         Ingest local or hosted documents into a GroundX bucket.
+
+         Parameters
+         ----------
+         documents : typing.Sequence[Document]
+
+         request_options : typing.Optional[RequestOptions]
+             Request-specific configuration.
+
+         Returns
+         -------
+         IngestResponse
+             Documents successfully uploaded
+
+         Examples
+         --------
+         from groundx import Document, GroundX
+
+         client = GroundX(
+             api_key="YOUR_API_KEY",
+         )
+
+         client.ingest(
+             documents=[
+                 Document(
+                     bucket_id=1234,
+                     file_name="my_file1.txt",
+                     file_path="https://my.source.url.com/file1.txt",
+                     file_type="txt",
+                 )
+             ],
+         )
+         """
+         remote_documents, local_documents = prep_documents(documents)
+
+         if local_documents and remote_documents:
+             raise ValueError("Documents must all be either local or remote, not a mix.")
+
+         if len(remote_documents) > 0:
+             return self.documents.ingest_remote(
+                 documents=remote_documents,
+                 request_options=request_options,
+             )
+
+         timeout = self._client_wrapper.get_timeout()
+         headers = self._client_wrapper.get_headers()
+         base_url = self._client_wrapper.get_base_url().rstrip("/")
+         follow_redirects = getattr(
+             self._client_wrapper.httpx_client, "follow_redirects", True
+         )
+
+         url = f"{base_url}/v1/ingest/documents/local"
+         _response = requests.post(
+             url,
+             files=local_documents,
+             headers=headers,
+             timeout=timeout,
+             allow_redirects=follow_redirects,
+         )
+
+         try:
+             if 200 <= _response.status_code < 300:
+                 return typing.cast(
+                     IngestResponse,
+                     parse_obj_as(
+                         type_=IngestResponse,  # type: ignore
+                         object_=_response.json(),
+                     ),
+                 )
+             if _response.status_code == 400:
+                 raise BadRequestError(
+                     typing.cast(
+                         typing.Optional[typing.Any],
+                         parse_obj_as(
+                             type_=typing.Optional[typing.Any],  # type: ignore
+                             object_=_response.json(),
+                         ),
+                     )
+                 )
+             if _response.status_code == 401:
+                 raise UnauthorizedError(
+                     typing.cast(
+                         typing.Optional[typing.Any],
+                         parse_obj_as(
+                             type_=typing.Optional[typing.Any],  # type: ignore
+                             object_=_response.json(),
+                         ),
+                     )
+                 )
+             _response_json = _response.json()
+         except JSONDecodeError:
+             raise ApiError(status_code=_response.status_code, body=_response.text)
+
+         raise ApiError(status_code=_response.status_code, body=_response_json)
+
+
+ class AsyncGroundX(AsyncGroundXBase):
+     async def ingest(
+         self,
+         *,
+         documents: typing.Sequence[Document],
+         request_options: typing.Optional[RequestOptions] = None,
+     ) -> IngestResponse:
+         """
+         Ingest local or hosted documents into a GroundX bucket.
+
+         Parameters
+         ----------
+         documents : typing.Sequence[Document]
+
+         request_options : typing.Optional[RequestOptions]
+             Request-specific configuration.
+
+         Returns
+         -------
+         IngestResponse
+             Documents successfully uploaded
+
+         Examples
+         --------
+         import asyncio
+
+         from groundx import AsyncGroundX, Document
+
+         client = AsyncGroundX(
+             api_key="YOUR_API_KEY",
+         )
+
+         async def main() -> None:
+             await client.ingest(
+                 documents=[
+                     Document(
+                         bucket_id=1234,
+                         file_name="my_file1.txt",
+                         file_path="https://my.source.url.com/file1.txt",
+                         file_type="txt",
+                     )
+                 ],
+             )
+
+         asyncio.run(main())
+         """
+         remote_documents, local_documents = prep_documents(documents)
+
+         if local_documents and remote_documents:
+             raise ValueError("Documents must all be either local or remote, not a mix.")
+
+         if len(remote_documents) > 0:
+             return await self.documents.ingest_remote(
+                 documents=remote_documents,
+                 request_options=request_options,
+             )
+
+         timeout = self._client_wrapper.get_timeout()
+         headers = self._client_wrapper.get_headers()
+         base_url = self._client_wrapper.get_base_url().rstrip("/")
+
+         url = f"{base_url}/v1/ingest/documents/local"
+
+         try:
+             async with aiohttp.ClientSession() as session:
+                 data = aiohttp.FormData()
+                 for field_name, (file_name, file_obj, content_type) in local_documents:
+                     data.add_field(
+                         name=field_name,
+                         value=file_obj,
+                         filename=file_name,
+                         content_type=content_type,
+                     )
+
+                 async with session.post(
+                     url, data=data, headers=headers, timeout=timeout
+                 ) as response:
+                     if 200 <= response.status < 300:
+                         response_data = await response.json()
+                         return typing.cast(
+                             IngestResponse,
+                             parse_obj_as(
+                                 type_=IngestResponse,  # type: ignore
+                                 object_=response_data,
+                             ),
+                         )
+                     if response.status == 400:
+                         raise BadRequestError(await response.json())
+                     if response.status == 401:
+                         raise UnauthorizedError(await response.json())
+
+                     raise ApiError(
+                         status_code=response.status, body=await response.text()
+                     )
+         except TimeoutError:
+             raise ApiError(status_code=408, body="Request timed out")
+         except aiohttp.ClientError as e:
+             raise ApiError(status_code=500, body=str(e))
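The new `ingest` helpers route URL paths to `documents.ingest_remote` and upload local paths as multipart form data (a "blob" part plus a "metadata" JSON part) to `/v1/ingest/documents/local`. A minimal usage sketch for the local case, assuming a bucket ID of 1234 and a file at `./docs/report.pdf` (both hypothetical values, not taken from this release):

```python
from groundx import Document, GroundX

client = GroundX(
    api_key="YOUR_API_KEY",
)

# A local file_path is expanded, opened, and posted as multipart form data;
# mixing local and remote documents in one call raises ValueError.
response = client.ingest(
    documents=[
        Document(
            bucket_id=1234,                  # hypothetical bucket ID
            file_name="report.pdf",          # hypothetical file name
            file_path="./docs/report.pdf",   # hypothetical local path
            file_type="pdf",
        )
    ],
)
print(response)
```

The remote case is covered by the docstring examples above; a batch that contains both kinds of `Document` is rejected before any upload is attempted.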
groundx/types/__init__.py CHANGED
@@ -11,6 +11,7 @@ from .customer_response import CustomerResponse
  from .document import Document
  from .document_detail import DocumentDetail
  from .document_list_response import DocumentListResponse
+ from .document_local_ingest_request import DocumentLocalIngestRequest
  from .document_lookup_response import DocumentLookupResponse
  from .document_response import DocumentResponse
  from .document_type import DocumentType
@@ -22,6 +23,7 @@ from .health_response_health import HealthResponseHealth
  from .health_service import HealthService
  from .health_service_status import HealthServiceStatus
  from .ingest_local_document import IngestLocalDocument
+ from .ingest_local_document_metadata import IngestLocalDocumentMetadata
  from .ingest_remote_document import IngestRemoteDocument
  from .ingest_response import IngestResponse
  from .ingest_response_ingest import IngestResponseIngest
@@ -56,6 +58,7 @@ __all__ = [
      "Document",
      "DocumentDetail",
      "DocumentListResponse",
+     "DocumentLocalIngestRequest",
      "DocumentLookupResponse",
      "DocumentResponse",
      "DocumentType",
@@ -67,6 +70,7 @@ __all__ = [
      "HealthService",
      "HealthServiceStatus",
      "IngestLocalDocument",
+     "IngestLocalDocumentMetadata",
      "IngestRemoteDocument",
      "IngestResponse",
      "IngestResponseIngest",
groundx/types/document.py CHANGED
@@ -12,14 +12,14 @@ from ..core.pydantic_utilities import IS_PYDANTIC_V2
  class Document(UniversalBaseModel):
      bucket_id: typing_extensions.Annotated[int, FieldMetadata(alias="bucketId")] = pydantic.Field()
      """
-     the bucketId of the bucket which this remote file will be ingested to.
+     The bucketId of the bucket which this file will be ingested into.
      """

      file_name: typing_extensions.Annotated[typing.Optional[str], FieldMetadata(alias="fileName")] = pydantic.Field(
          default=None
      )
      """
-     The name of the file being ingested
+     The name of the file being ingested.
      """

      file_path: typing_extensions.Annotated[str, FieldMetadata(alias="filePath")] = pydantic.Field()
groundx/types/document_local_ingest_request.py ADDED
@@ -0,0 +1,6 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ import typing
+ from .ingest_local_document import IngestLocalDocument
+
+ DocumentLocalIngestRequest = typing.List[IngestLocalDocument]
groundx/types/ingest_local_document.py CHANGED
@@ -1,37 +1,19 @@
  # This file was auto-generated by Fern from our API Definition.

  from ..core.pydantic_utilities import UniversalBaseModel
- import typing_extensions
- from ..core.serialization import FieldMetadata
  import pydantic
- from .document_type import DocumentType
- import typing
+ from .ingest_local_document_metadata import IngestLocalDocumentMetadata
  from ..core.pydantic_utilities import IS_PYDANTIC_V2
+ import typing


  class IngestLocalDocument(UniversalBaseModel):
-     bucket_id: typing_extensions.Annotated[int, FieldMetadata(alias="bucketId")] = pydantic.Field()
-     """
-     the bucketId of the bucket which this local file will be ingested to.
-     """
-
-     file_data: typing_extensions.Annotated[str, FieldMetadata(alias="fileData")] = pydantic.Field()
+     blob: str = pydantic.Field()
      """
-     Binary data for the file being ingested.
+     The binary file data being ingested.
      """

-     file_name: typing_extensions.Annotated[str, FieldMetadata(alias="fileName")] = pydantic.Field()
-     """
-     The name of the file being ingested
-     """
-
-     file_type: typing_extensions.Annotated[DocumentType, FieldMetadata(alias="fileType")]
-     search_data: typing_extensions.Annotated[
-         typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]], FieldMetadata(alias="searchData")
-     ] = pydantic.Field(default=None)
-     """
-     Custom metadata which can be used to influence GroundX's search functionality. This data can be used to further hone GroundX search.
-     """
+     metadata: IngestLocalDocumentMetadata

      if IS_PYDANTIC_V2:
          model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
groundx/types/ingest_local_document_metadata.py ADDED
@@ -0,0 +1,42 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ from ..core.pydantic_utilities import UniversalBaseModel
+ import typing_extensions
+ import typing
+ from ..core.serialization import FieldMetadata
+ import pydantic
+ from .document_type import DocumentType
+ from ..core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+ class IngestLocalDocumentMetadata(UniversalBaseModel):
+     bucket_id: typing_extensions.Annotated[typing.Optional[int], FieldMetadata(alias="bucketId")] = pydantic.Field(
+         default=None
+     )
+     """
+     The bucketId of the bucket which this local file will be ingested into.
+     """
+
+     file_name: typing_extensions.Annotated[typing.Optional[str], FieldMetadata(alias="fileName")] = pydantic.Field(
+         default=None
+     )
+     """
+     The name of the file being ingested
+     """
+
+     file_type: typing_extensions.Annotated[typing.Optional[DocumentType], FieldMetadata(alias="fileType")] = None
+     search_data: typing_extensions.Annotated[
+         typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]], FieldMetadata(alias="searchData")
+     ] = pydantic.Field(default=None)
+     """
+     Custom metadata which can be used to influence GroundX's search functionality. This data can be used to further hone GroundX search.
+     """
+
+     if IS_PYDANTIC_V2:
+         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+     else:
+
+         class Config:
+             frozen = True
+             smart_union = True
+             extra = pydantic.Extra.allow
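The reworked local-ingest types mirror the multipart payload that `prep_documents` assembles: `IngestLocalDocument` pairs a binary `blob` with an `IngestLocalDocumentMetadata` whose aliased fields (`bucketId`, `fileName`, `fileType`, `searchData`) match the JSON "metadata" part. A small sketch constructing the metadata model, assuming a bucket ID of 1234 and a `"pdf"` file type (hypothetical values for illustration):

```python
from groundx.types import IngestLocalDocumentMetadata

# All fields are optional; the snake_case attribute names serialize to the
# camelCase wire aliases (bucketId, fileName, fileType, searchData).
metadata = IngestLocalDocumentMetadata(
    bucket_id=1234,                 # hypothetical bucket ID
    file_name="report.pdf",         # hypothetical file name
    file_type="pdf",
    search_data={"key": "value"},
)
print(metadata)
```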
groundx/types/ingest_remote_document.py CHANGED
@@ -12,7 +12,7 @@ from ..core.pydantic_utilities import IS_PYDANTIC_V2
  class IngestRemoteDocument(UniversalBaseModel):
      bucket_id: typing_extensions.Annotated[int, FieldMetadata(alias="bucketId")] = pydantic.Field()
      """
-     the bucketId of the bucket which this remote file will be ingested to.
+     The bucketId of the bucket which this remote file will be ingested into.
      """

      file_name: typing_extensions.Annotated[typing.Optional[str], FieldMetadata(alias="fileName")] = pydantic.Field(
@@ -11,7 +11,7 @@ from ..core.pydantic_utilities import IS_PYDANTIC_V2
  class WebsiteSource(UniversalBaseModel):
      bucket_id: typing_extensions.Annotated[int, FieldMetadata(alias="bucketId")] = pydantic.Field()
      """
-     the bucketId of the bucket which this website will be ingested to.
+     The bucketId of the bucket which this website will be ingested into.
      """

      cap: typing.Optional[int] = pydantic.Field(default=None)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: groundx
- Version: 2.0.20
+ Version: 2.0.29
  Summary: 
  License: MIT
  Requires-Python: >=3.8,<4.0
@@ -20,18 +20,20 @@ Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Typing :: Typed
+ Requires-Dist: aiohttp (>=3.8.0)
  Requires-Dist: httpx (>=0.21.2)
  Requires-Dist: pydantic (>=1.9.2)
  Requires-Dist: pydantic-core (>=2.18.2,<3.0.0)
+ Requires-Dist: requests (>=2.4.0)
  Requires-Dist: typing_extensions (>=4.0.0)
  Description-Content-Type: text/markdown
  
- # Eyelevel Python Library
+ # GroundX Python Library
  
  [![fern shield](https://img.shields.io/badge/%F0%9F%8C%BF-Built%20with%20Fern-brightgreen)](https://buildwithfern.com?utm_source=github&utm_medium=github&utm_campaign=readme&utm_source=https%3A%2F%2Fgithub.com%2Feyelevelai%2Fgroundx-python)
  [![pypi](https://img.shields.io/pypi/v/groundx)](https://pypi.python.org/pypi/groundx)
  
- The Eyelevel Python library provides convenient access to the Eyelevel API from Python.
+ The GroundX Python library provides convenient access to the GroundX API from Python.
  
  ## Documentation
  
@@ -57,21 +59,15 @@ from groundx import Document, GroundX
  client = GroundX(
      api_key="YOUR_API_KEY",
  )
- client.documents.ingest(
+
+ client.ingest(
      documents=[
          Document(
              bucket_id=1234,
              file_name="my_file1.txt",
-             file_path="https://my.source.url.com/file1.txt",
              file_type="txt",
-             search_data={"key": "value"},
-         ),
-         Document(
-             bucket_id=1234,
-             file_name="my_file2.pdf",
-             file_path="/local/path/file2.pdf",
-             file_type="pdf",
-         ),
+             source_url="https://my.source.url.com/file1.txt",
+         )
      ],
  )
  ```
@@ -89,27 +85,18 @@ client = AsyncGroundX(
      api_key="YOUR_API_KEY",
  )
  
-
  async def main() -> None:
-     await client.documents.ingest(
+     await client.ingest(
          documents=[
              Document(
                  bucket_id=1234,
                  file_name="my_file1.txt",
-                 file_path="https://my.source.url.com/file1.txt",
                  file_type="txt",
-                 search_data={"key": "value"},
-             ),
-             Document(
-                 bucket_id=1234,
-                 file_name="my_file2.pdf",
-                 file_path="/local/path/file2.pdf",
-                 file_type="pdf",
-             ),
+                 source_url="https://my.source.url.com/file1.txt",
+             )
          ],
      )
  
-
  asyncio.run(main())
  ```
  
@@ -122,7 +109,7 @@ will be thrown.
  from groundx.core.api_error import ApiError
  
  try:
-     client.documents.ingest(...)
+     client.ingest(...)
  except ApiError as e:
      print(e.status_code)
      print(e.body)
@@ -145,7 +132,7 @@ A request is deemed retriable when any of the following HTTP status codes is ret
  Use the `max_retries` request option to configure this behavior.
  
  ```python
- client.documents.ingest(..., request_options={
+ client.ingest(..., request_options={
      "max_retries": 1
  })
  ```
@@ -165,7 +152,7 @@ client = GroundX(
  
  
  # Override timeout for a specific method
- client.documents.ingest(..., request_options={
+ client.ingest(..., request_options={
      "timeout_in_seconds": 1
  })
  ```