indexify 0.0.43__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/__init__.py +15 -14
- indexify/base_client.py +48 -21
- indexify/cli.py +247 -0
- indexify/client.py +18 -790
- indexify/error.py +3 -30
- indexify/executor/agent.py +364 -0
- indexify/executor/api_objects.py +43 -0
- indexify/executor/downloader.py +124 -0
- indexify/executor/executor_tasks.py +72 -0
- indexify/executor/function_worker.py +177 -0
- indexify/executor/indexify_executor.py +32 -0
- indexify/executor/runtime_probes.py +48 -0
- indexify/executor/task_reporter.py +110 -0
- indexify/executor/task_store.py +113 -0
- indexify/foo +72 -0
- indexify/functions_sdk/data_objects.py +37 -0
- indexify/functions_sdk/graph.py +281 -0
- indexify/functions_sdk/graph_validation.py +66 -0
- indexify/functions_sdk/image.py +34 -0
- indexify/functions_sdk/indexify_functions.py +188 -0
- indexify/functions_sdk/local_cache.py +46 -0
- indexify/functions_sdk/object_serializer.py +60 -0
- indexify/local_client.py +183 -0
- indexify/remote_client.py +319 -0
- indexify-0.2.1.dist-info/METADATA +151 -0
- indexify-0.2.1.dist-info/RECORD +33 -0
- indexify-0.2.1.dist-info/entry_points.txt +3 -0
- indexify/exceptions.py +0 -3
- indexify/extraction_policy.py +0 -75
- indexify/extractor_sdk/__init__.py +0 -14
- indexify/extractor_sdk/data.py +0 -100
- indexify/extractor_sdk/extractor.py +0 -225
- indexify/extractor_sdk/utils.py +0 -102
- indexify/extractors/__init__.py +0 -0
- indexify/extractors/embedding.py +0 -55
- indexify/extractors/pdf_parser.py +0 -93
- indexify/graph.py +0 -133
- indexify/local_runner.py +0 -128
- indexify/runner.py +0 -22
- indexify/utils.py +0 -7
- indexify-0.0.43.dist-info/METADATA +0 -66
- indexify-0.0.43.dist-info/RECORD +0 -25
- {indexify-0.0.43.dist-info → indexify-0.2.1.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.43.dist-info → indexify-0.2.1.dist-info}/WHEEL +0 -0
indexify/client.py
CHANGED
@@ -1,790 +1,18 @@
|
|
1
|
-
import
|
2
|
-
|
3
|
-
import
|
4
|
-
import
|
5
|
-
from
|
6
|
-
from
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
|
20
|
-
from .utils import json_set_default
|
21
|
-
|
22
|
-
Document = namedtuple("Document", ["text", "labels", "id"])
|
23
|
-
|
24
|
-
SQLQueryRow = namedtuple("SQLQueryRow", ["content_id", "data"])
|
25
|
-
|
26
|
-
|
27
|
-
def generate_unique_hex_id():
|
28
|
-
"""
|
29
|
-
Generate a unique hexadecimal identifier
|
30
|
-
|
31
|
-
Returns:
|
32
|
-
str: a unique hexadecimal string
|
33
|
-
"""
|
34
|
-
return uuid.uuid4().hex[:16]
|
35
|
-
|
36
|
-
|
37
|
-
def generate_hash_from_string(input_string: str):
|
38
|
-
"""
|
39
|
-
Generate a hash for the given string and return it as a hexadecimal string.
|
40
|
-
|
41
|
-
Args:
|
42
|
-
input_string (str): The input string to hash.
|
43
|
-
|
44
|
-
Returns:
|
45
|
-
str: The hexadecimal hash of the input string.
|
46
|
-
"""
|
47
|
-
hash_object = hashlib.sha256(input_string.encode())
|
48
|
-
return hash_object.hexdigest()[:16]
|
49
|
-
|
50
|
-
|
51
|
-
@dataclass
|
52
|
-
class SqlQueryResult:
|
53
|
-
result: List[Dict]
|
54
|
-
|
55
|
-
|
56
|
-
class IndexifyClient:
|
57
|
-
"""
|
58
|
-
IndexifyClient is the main entry point for the SDK.
|
59
|
-
For the full list of client features, see the
|
60
|
-
[httpx Client documentation](https://www.python-httpx.org/api/#client).
|
61
|
-
|
62
|
-
:param service_url: The URL of the Indexify service to connect to.
|
63
|
-
:param args: Arguments to pass to the httpx.Client constructor
|
64
|
-
:param kwargs: Keyword arguments to pass to the httpx.Client constructor
|
65
|
-
|
66
|
-
Example usage:
|
67
|
-
```
|
68
|
-
from indexify import IndexifyClient
|
69
|
-
|
70
|
-
client = IndexifyClient()
|
71
|
-
assert client.heartbeat() == True
|
72
|
-
```
|
73
|
-
"""
|
74
|
-
|
75
|
-
def __init__(
|
76
|
-
self,
|
77
|
-
service_url: str = DEFAULT_SERVICE_URL, # switch this to DEFAULT_SERVICE_URL_HTTPS for TLS
|
78
|
-
namespace: str = "default",
|
79
|
-
config_path: Optional[str] = None,
|
80
|
-
*args,
|
81
|
-
**kwargs,
|
82
|
-
):
|
83
|
-
if config_path:
|
84
|
-
with open(config_path, "r") as file:
|
85
|
-
config = yaml.safe_load(file)
|
86
|
-
|
87
|
-
if config.get("use_tls", False):
|
88
|
-
tls_config = config["tls_config"]
|
89
|
-
self._client = httpx.Client(
|
90
|
-
http2=True,
|
91
|
-
cert=(tls_config["cert_path"], tls_config["key_path"]),
|
92
|
-
verify=tls_config.get("ca_bundle_path", True),
|
93
|
-
)
|
94
|
-
else:
|
95
|
-
self._client = httpx.Client(*args, **kwargs)
|
96
|
-
else:
|
97
|
-
self._client = httpx.Client(*args, **kwargs)
|
98
|
-
|
99
|
-
self.namespace: str = namespace
|
100
|
-
self.extraction_graphs: List[ExtractionGraph] = []
|
101
|
-
self.labels: dict = {}
|
102
|
-
self._service_url = service_url
|
103
|
-
self._timeout = kwargs.get("timeout")
|
104
|
-
|
105
|
-
# get namespace data
|
106
|
-
self.extraction_graphs = self.get_extraction_graphs()
|
107
|
-
|
108
|
-
@classmethod
|
109
|
-
def with_mtls(
|
110
|
-
cls,
|
111
|
-
cert_path: str,
|
112
|
-
key_path: str,
|
113
|
-
ca_bundle_path: Optional[str] = None,
|
114
|
-
service_url: str = DEFAULT_SERVICE_URL_HTTPS,
|
115
|
-
*args,
|
116
|
-
**kwargs,
|
117
|
-
) -> "IndexifyClient":
|
118
|
-
"""
|
119
|
-
Create a client with mutual TLS authentication. Also enables HTTP/2,
|
120
|
-
which is required for mTLS.
|
121
|
-
NOTE: mTLS must be enabled on the Indexify service for this to work.
|
122
|
-
|
123
|
-
:param cert_path: Path to the client certificate. Resolution handled by httpx.
|
124
|
-
:param key_path: Path to the client key. Resolution handled by httpx.
|
125
|
-
:param args: Arguments to pass to the httpx.Client constructor
|
126
|
-
:param kwargs: Keyword arguments to pass to the httpx.Client constructor
|
127
|
-
:return: A client with mTLS authentication
|
128
|
-
|
129
|
-
Example usage:
|
130
|
-
```
|
131
|
-
from indexify import IndexifyClient
|
132
|
-
|
133
|
-
client = IndexifyClient.with_mtls(
|
134
|
-
cert_path="/path/to/cert.pem",
|
135
|
-
key_path="/path/to/key.pem",
|
136
|
-
)
|
137
|
-
assert client.heartbeat() == True
|
138
|
-
```
|
139
|
-
"""
|
140
|
-
if not (cert_path and key_path):
|
141
|
-
raise ValueError("Both cert and key must be provided for mTLS")
|
142
|
-
|
143
|
-
client_certs = (cert_path, key_path)
|
144
|
-
verify_option = ca_bundle_path if ca_bundle_path else True
|
145
|
-
client = IndexifyClient(
|
146
|
-
*args,
|
147
|
-
**kwargs,
|
148
|
-
service_url=service_url,
|
149
|
-
http2=True,
|
150
|
-
cert=client_certs,
|
151
|
-
verify=verify_option,
|
152
|
-
)
|
153
|
-
return client
|
154
|
-
|
155
|
-
def _request(self, method: str, **kwargs) -> httpx.Response:
|
156
|
-
try:
|
157
|
-
response = self._client.request(method, timeout=self._timeout, **kwargs)
|
158
|
-
status_code = str(response.status_code)
|
159
|
-
if status_code.startswith("4"):
|
160
|
-
raise ApiException(
|
161
|
-
"status code: " + status_code + " request args: " + str(kwargs)
|
162
|
-
)
|
163
|
-
if status_code.startswith("5"):
|
164
|
-
raise ApiException(response.text)
|
165
|
-
# error = Error.from_tonic_error_string(str(response.url), response.text)
|
166
|
-
# self.__print_additional_error_context(error)
|
167
|
-
# raise error
|
168
|
-
except httpx.ConnectError:
|
169
|
-
message = (
|
170
|
-
f"Make sure the server is running and accesible at {self._service_url}"
|
171
|
-
)
|
172
|
-
error = Error(status="ConnectionError", message=message)
|
173
|
-
print(error)
|
174
|
-
raise error
|
175
|
-
return response
|
176
|
-
|
177
|
-
def get(self, endpoint: str, **kwargs) -> httpx.Response:
|
178
|
-
"""
|
179
|
-
Make a GET request to the Indexify service.
|
180
|
-
|
181
|
-
:param endpoint: The endpoint to make the request to.
|
182
|
-
|
183
|
-
Example usage:
|
184
|
-
```
|
185
|
-
from indexify import IndexifyClient
|
186
|
-
|
187
|
-
client = IndexifyClient()
|
188
|
-
response = client.get("namespaces")
|
189
|
-
print(response.json())
|
190
|
-
```
|
191
|
-
"""
|
192
|
-
return self._request("GET", url=f"{self._service_url}/{endpoint}", **kwargs)
|
193
|
-
|
194
|
-
def post(self, endpoint: str, **kwargs) -> httpx.Response:
|
195
|
-
"""
|
196
|
-
Make a POST request to the Indexify service.
|
197
|
-
|
198
|
-
:param endpoint: The endpoint to make the request to.
|
199
|
-
|
200
|
-
Example usage:
|
201
|
-
|
202
|
-
```
|
203
|
-
from indexify import IndexifyClient
|
204
|
-
|
205
|
-
client = IndexifyClient()
|
206
|
-
response = client.post("namespaces", json={"name": "my-repo"})
|
207
|
-
print(response.json())
|
208
|
-
```
|
209
|
-
"""
|
210
|
-
return self._request("POST", url=f"{self._service_url}/{endpoint}", **kwargs)
|
211
|
-
|
212
|
-
def put(self, endpoint: str, **kwargs) -> httpx.Response:
|
213
|
-
"""
|
214
|
-
Make a PUT request to the Indexify service.
|
215
|
-
|
216
|
-
:param endpoint: The endpoint to make the request to.
|
217
|
-
|
218
|
-
Example usage:
|
219
|
-
|
220
|
-
```
|
221
|
-
from indexify import IndexifyClient
|
222
|
-
|
223
|
-
client = IndexifyClient()
|
224
|
-
response = client.put("namespaces", json={"name": "my-repo"})
|
225
|
-
print(response.json())
|
226
|
-
```
|
227
|
-
"""
|
228
|
-
return self._request("PUT", url=f"{self._service_url}/{endpoint}", **kwargs)
|
229
|
-
|
230
|
-
def delete(self, endpoint: str, **kwargs) -> httpx.Response:
|
231
|
-
"""
|
232
|
-
Make a DELETE request to the Indexify service.
|
233
|
-
|
234
|
-
:param endpoint: The endpoint to make the request to.
|
235
|
-
|
236
|
-
Example usage:
|
237
|
-
|
238
|
-
```
|
239
|
-
from indexify import IndexifyClient
|
240
|
-
|
241
|
-
client = IndexifyClient()
|
242
|
-
response = client.delete("namespaces")
|
243
|
-
print(response.json())
|
244
|
-
```
|
245
|
-
"""
|
246
|
-
return self._request("DELETE", url=f"{self._service_url}/{endpoint}", **kwargs)
|
247
|
-
|
248
|
-
def close(self):
|
249
|
-
"""
|
250
|
-
Close the underlying httpx.Client.
|
251
|
-
"""
|
252
|
-
self._client.close()
|
253
|
-
|
254
|
-
# __enter__ and __exit__ allow the client to be used as a context manager
|
255
|
-
def __enter__(self):
|
256
|
-
return self
|
257
|
-
|
258
|
-
def __exit__(self, exc_type, exc_value, traceback):
|
259
|
-
self.close()
|
260
|
-
|
261
|
-
def heartbeat(self, heartbeat_response="Indexify Server") -> bool:
|
262
|
-
"""
|
263
|
-
Check if the Indexify service is alive.
|
264
|
-
"""
|
265
|
-
response = self.get(f"")
|
266
|
-
# Server responds with text: "Indexify Server"
|
267
|
-
return response.text == heartbeat_response
|
268
|
-
|
269
|
-
def namespaces(self) -> list[str]:
|
270
|
-
"""
|
271
|
-
Get a list of all namespaces.
|
272
|
-
"""
|
273
|
-
response = self.get(f"namespaces")
|
274
|
-
namespaces_dict = response.json()["namespaces"]
|
275
|
-
namespaces = []
|
276
|
-
for item in namespaces_dict:
|
277
|
-
namespaces.append(item["name"])
|
278
|
-
return namespaces
|
279
|
-
|
280
|
-
@classmethod
|
281
|
-
def create_namespace(
|
282
|
-
self,
|
283
|
-
namespace: str,
|
284
|
-
extraction_graphs: list = [],
|
285
|
-
labels: dict = {},
|
286
|
-
service_url: str = DEFAULT_SERVICE_URL,
|
287
|
-
) -> "IndexifyClient":
|
288
|
-
"""
|
289
|
-
Create a new namespace.
|
290
|
-
|
291
|
-
Returns:
|
292
|
-
IndexifyClient: a new client with the given namespace
|
293
|
-
"""
|
294
|
-
extraction_graphs = []
|
295
|
-
for bd in extraction_graphs:
|
296
|
-
if isinstance(bd, extraction_graphs):
|
297
|
-
extraction_graphs.append(bd.to_dict())
|
298
|
-
else:
|
299
|
-
extraction_graphs.append(bd)
|
300
|
-
|
301
|
-
req = {
|
302
|
-
"name": namespace,
|
303
|
-
"extraction_graphs": extraction_graphs,
|
304
|
-
"labels": labels,
|
305
|
-
}
|
306
|
-
|
307
|
-
with httpx.Client() as client:
|
308
|
-
client.post(f"{service_url}/namespaces", json=req)
|
309
|
-
|
310
|
-
client = IndexifyClient(namespace=namespace, service_url=service_url)
|
311
|
-
return client
|
312
|
-
|
313
|
-
def _add_content_url(self, content):
|
314
|
-
"""
|
315
|
-
Add download content_url url property
|
316
|
-
"""
|
317
|
-
return {
|
318
|
-
**content,
|
319
|
-
"content_url": f"{self._service_url}/namespaces/{self.namespace}/content/{content['id']}/download",
|
320
|
-
}
|
321
|
-
|
322
|
-
def indexes(self) -> dict:
|
323
|
-
"""
|
324
|
-
Get the indexes of the current namespace.
|
325
|
-
|
326
|
-
Returns:
|
327
|
-
List[Index]: list of indexes in the current namespace
|
328
|
-
"""
|
329
|
-
response = self.get(f"namespaces/{self.namespace}/indexes")
|
330
|
-
return response.json()["indexes"]
|
331
|
-
|
332
|
-
def extractors(self) -> List[ExtractorMetadata]:
|
333
|
-
"""
|
334
|
-
Get a list of all extractors.
|
335
|
-
|
336
|
-
Returns:
|
337
|
-
List[Extractor]: list of extractors
|
338
|
-
"""
|
339
|
-
response = self.get(f"extractors")
|
340
|
-
extractors_dict = response.json()["extractors"]
|
341
|
-
extractors = []
|
342
|
-
for ed in extractors_dict:
|
343
|
-
print(ed)
|
344
|
-
extractors.append(ExtractorMetadata.model_validate(ed))
|
345
|
-
return extractors
|
346
|
-
|
347
|
-
def get_extraction_graphs(self) -> List[ExtractionGraph]:
|
348
|
-
"""
|
349
|
-
Retrieve and update the list of extraction policies for the current namespace.
|
350
|
-
"""
|
351
|
-
response = self.get(f"namespaces/{self.namespace}/extraction_graphs")
|
352
|
-
json = response.json()
|
353
|
-
|
354
|
-
self.extraction_graphs = []
|
355
|
-
for graph in json["extraction_graphs"]:
|
356
|
-
self.extraction_graphs.append(ExtractionGraph.from_dict(graph))
|
357
|
-
|
358
|
-
return self.extraction_graphs
|
359
|
-
|
360
|
-
def create_extraction_graph(self, extraction_graph: ExtractionGraph):
|
361
|
-
"""
|
362
|
-
Create a new extraction graph.
|
363
|
-
|
364
|
-
Args:
|
365
|
-
- extraction_graph (ExtractionGraph): the extraction graph to create
|
366
|
-
"""
|
367
|
-
req = extraction_graph.to_dict()
|
368
|
-
req["namespace"] = self.namespace
|
369
|
-
request_body = json.dumps(req, default=json_set_default)
|
370
|
-
response = self.post(
|
371
|
-
f"namespaces/{self.namespace}/extraction_graphs",
|
372
|
-
data=request_body,
|
373
|
-
headers={"Content-Type": "application/json"},
|
374
|
-
)
|
375
|
-
return
|
376
|
-
|
377
|
-
def link_extraction_graphs(
|
378
|
-
self, source_graph: str, content_source: str, linked_graph: str
|
379
|
-
):
|
380
|
-
"""
|
381
|
-
Link an extraction graph to another extraction graph.
|
382
|
-
|
383
|
-
Args:
|
384
|
-
- source_graph (str): source extraction graph
|
385
|
-
- content_source (str): content source in source graph
|
386
|
-
- linked_graph (str): target extraction graph
|
387
|
-
"""
|
388
|
-
req = {
|
389
|
-
"content_source": content_source,
|
390
|
-
"linked_graph_name": linked_graph,
|
391
|
-
}
|
392
|
-
response = self.post(
|
393
|
-
f"namespaces/{self.namespace}/extraction_graphs/{source_graph}/links",
|
394
|
-
json=req,
|
395
|
-
headers={"Content-Type": "application/json"},
|
396
|
-
)
|
397
|
-
return
|
398
|
-
|
399
|
-
def get_content_metadata(self, content_id: str) -> dict:
|
400
|
-
"""
|
401
|
-
Get metadata for a specific content ID in a given index.
|
402
|
-
|
403
|
-
Args:
|
404
|
-
- content_id (str): content id to query
|
405
|
-
"""
|
406
|
-
response = self.get(
|
407
|
-
f"namespaces/{self.namespace}/content/{content_id}/metadata"
|
408
|
-
)
|
409
|
-
return response.json()["content_metadata"]
|
410
|
-
|
411
|
-
def download_content(self, content_id: str) -> bytes:
|
412
|
-
"""
|
413
|
-
Download content from id. Return bytes
|
414
|
-
|
415
|
-
Args:
|
416
|
-
- content_id (str): id of content to download
|
417
|
-
"""
|
418
|
-
response = self.get(
|
419
|
-
f"namespaces/{self.namespace}/content/{content_id}/download"
|
420
|
-
)
|
421
|
-
return response.content
|
422
|
-
|
423
|
-
def add_documents(
|
424
|
-
self,
|
425
|
-
extraction_graphs: Union[str, List[str]],
|
426
|
-
documents: Union[Document, str, List[Union[Document, str]]],
|
427
|
-
doc_id=None,
|
428
|
-
) -> Union[str, List[str]]:
|
429
|
-
"""
|
430
|
-
Add documents to current namespace.
|
431
|
-
|
432
|
-
Args:
|
433
|
-
- documents (Union[Document, str, List[Union[Document, str]]]): this can be a list of strings, list of Documents or a mix of both
|
434
|
-
"""
|
435
|
-
if isinstance(extraction_graphs, str):
|
436
|
-
extraction_graphs = [extraction_graphs]
|
437
|
-
if isinstance(documents, Document):
|
438
|
-
documents = [documents]
|
439
|
-
elif isinstance(documents, str):
|
440
|
-
documents = [Document(documents, {}, id=doc_id)]
|
441
|
-
elif isinstance(documents, list):
|
442
|
-
new_documents = []
|
443
|
-
for item in documents:
|
444
|
-
if isinstance(item, Document):
|
445
|
-
new_documents.append(item)
|
446
|
-
elif isinstance(item, str):
|
447
|
-
new_documents.append(
|
448
|
-
Document(item, {}, id=None)
|
449
|
-
) # don't pass in id for a string content because doesn't make sense to have same content id for all strings
|
450
|
-
else:
|
451
|
-
raise ValueError(
|
452
|
-
"List items must be either Document instances or strings."
|
453
|
-
)
|
454
|
-
documents = new_documents
|
455
|
-
else:
|
456
|
-
raise TypeError(
|
457
|
-
"Invalid type for documents. Expected Document, str, or list of these."
|
458
|
-
)
|
459
|
-
for document in documents:
|
460
|
-
document.labels["mime_type"] = "text/plain"
|
461
|
-
content_ids = []
|
462
|
-
if isinstance(extraction_graphs, str):
|
463
|
-
extraction_graphs = [extraction_graphs]
|
464
|
-
for extraction_graph in extraction_graphs:
|
465
|
-
for document in documents:
|
466
|
-
response = self.post(
|
467
|
-
f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
|
468
|
-
files={"file": document.text},
|
469
|
-
data={"labels": json.dumps(document.labels)},
|
470
|
-
)
|
471
|
-
response_json = response.json()
|
472
|
-
content_id = response_json["content_id"]
|
473
|
-
content_ids.append(content_id)
|
474
|
-
return content_ids
|
475
|
-
|
476
|
-
def delete_documents(self, document_ids: List[str]) -> None:
|
477
|
-
"""
|
478
|
-
Delete documents from current namespace.
|
479
|
-
|
480
|
-
Args:
|
481
|
-
- document_ids (List[str]): list of document ids to delete
|
482
|
-
"""
|
483
|
-
req = {"content_ids": document_ids}
|
484
|
-
response = self.delete(
|
485
|
-
f"namespaces/{self.namespace}/content",
|
486
|
-
json=req,
|
487
|
-
headers={"Content-Type": "application/json"},
|
488
|
-
)
|
489
|
-
|
490
|
-
def update_labels(self, document_id: str, labels: Dict[str, str]) -> None:
|
491
|
-
"""
|
492
|
-
Update labels for a document.
|
493
|
-
|
494
|
-
Args:
|
495
|
-
- document_id (str): id of document to update
|
496
|
-
- labels (Dict[str, str]): labels to update
|
497
|
-
"""
|
498
|
-
req = {"labels": labels}
|
499
|
-
response = self.put(
|
500
|
-
f"namespaces/{self.namespace}/content/{document_id}/labels",
|
501
|
-
json=req,
|
502
|
-
headers={"Content-Type": "application/json"},
|
503
|
-
)
|
504
|
-
|
505
|
-
def update_content(self, document_id: str, path: str) -> None:
|
506
|
-
"""
|
507
|
-
Update a piece of content with a new file
|
508
|
-
|
509
|
-
Args:
|
510
|
-
- path (str): relative path to the file to be uploaded
|
511
|
-
"""
|
512
|
-
with open(path, "rb") as f:
|
513
|
-
response = self.put(
|
514
|
-
f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
|
515
|
-
)
|
516
|
-
|
517
|
-
def get_structured_data(self, content_id: str) -> dict:
|
518
|
-
"""
|
519
|
-
Query metadata for a specific content ID in a given index.
|
520
|
-
|
521
|
-
Args:
|
522
|
-
- content_id (str): content id to query
|
523
|
-
"""
|
524
|
-
response = self.get(
|
525
|
-
f"namespaces/{self.namespace}/content/{content_id}/metadata"
|
526
|
-
)
|
527
|
-
return response.json().get("metadata", [])
|
528
|
-
|
529
|
-
def search_index(
|
530
|
-
self, name: str, query: str, top_k: int, filters: List[str] = []
|
531
|
-
) -> dict:
|
532
|
-
"""
|
533
|
-
Search index in the current namespace.
|
534
|
-
|
535
|
-
Args:
|
536
|
-
- name (str): name of index to search
|
537
|
-
- query (str): query string
|
538
|
-
- top_k (int): top k nearest neighbors to be returned
|
539
|
-
- filters (List[str]): list of filters to apply
|
540
|
-
"""
|
541
|
-
req = {"query": query, "k": top_k, "filters": filters}
|
542
|
-
response = self.post(
|
543
|
-
f"namespaces/{self.namespace}/indexes/{name}/search",
|
544
|
-
json=req,
|
545
|
-
headers={"Content-Type": "application/json"},
|
546
|
-
)
|
547
|
-
return response.json()["results"]
|
548
|
-
|
549
|
-
def list_content(
|
550
|
-
self,
|
551
|
-
extraction_graph: str,
|
552
|
-
extraction_policy: str = "",
|
553
|
-
labels_filter: List[str] = [],
|
554
|
-
start_id: str = "",
|
555
|
-
limit: int = 10,
|
556
|
-
) -> List[ContentMetadata]:
|
557
|
-
"""
|
558
|
-
List content in the current namespace.
|
559
|
-
|
560
|
-
Args:
|
561
|
-
- extraction_graph (str): extraction graph name
|
562
|
-
- start_index (str): start index for pagination
|
563
|
-
- limit (int): number of items to return
|
564
|
-
"""
|
565
|
-
params = {"start_id": start_id, "limit": limit}
|
566
|
-
if extraction_policy:
|
567
|
-
params["source"] = extraction_policy
|
568
|
-
else:
|
569
|
-
params["source"] = "ingestion"
|
570
|
-
if len(labels_filter) > 0:
|
571
|
-
params["labels_filter"] = labels_filter
|
572
|
-
response = self.get(
|
573
|
-
f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/content",
|
574
|
-
params=params,
|
575
|
-
)
|
576
|
-
content_list = response.json()["content_list"]
|
577
|
-
content = []
|
578
|
-
for item in content_list:
|
579
|
-
content.append(ContentMetadata.from_dict(item))
|
580
|
-
return content
|
581
|
-
|
582
|
-
def upload_file(
|
583
|
-
self,
|
584
|
-
extraction_graph: str,
|
585
|
-
path: str,
|
586
|
-
file_bytes: bytes = None,
|
587
|
-
id=None,
|
588
|
-
labels: dict = {},
|
589
|
-
) -> str:
|
590
|
-
"""
|
591
|
-
Upload a file from a path or the bytes.
|
592
|
-
|
593
|
-
Args:
|
594
|
-
- extraction_graph (str): name of the extraction graph to use for extraction
|
595
|
-
- path (Union[str, bytes]): relative path to the file to be uploaded, or the bytes of the file
|
596
|
-
- labels (dict): labels to be associated with the file
|
597
|
-
"""
|
598
|
-
params = {}
|
599
|
-
if id is not None:
|
600
|
-
params["id"] = id
|
601
|
-
|
602
|
-
if file_bytes == None:
|
603
|
-
with open(path, "rb") as f:
|
604
|
-
response = self.post(
|
605
|
-
f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
|
606
|
-
files={"file": f},
|
607
|
-
data={"labels": json.dumps(labels)},
|
608
|
-
params=params,
|
609
|
-
)
|
610
|
-
else:
|
611
|
-
response = self.post(
|
612
|
-
f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
|
613
|
-
files={"file": (path, file_bytes)},
|
614
|
-
data={"labels": json.dumps(labels)},
|
615
|
-
params=params,
|
616
|
-
)
|
617
|
-
file_content = path
|
618
|
-
|
619
|
-
response_json = response.json()
|
620
|
-
content_id = response_json["content_id"]
|
621
|
-
return content_id
|
622
|
-
|
623
|
-
def ingest_from_loader(
|
624
|
-
self, loader: DataLoader, extraction_graph: str
|
625
|
-
) -> List[str]:
|
626
|
-
"""
|
627
|
-
Loads content using the loader, uploads them to Indexify and returns the content ids.
|
628
|
-
loader: DataLoader: The DataLoader object to use for loading content
|
629
|
-
extraction_graph: str: The name of the extraction graph to use for extraction
|
630
|
-
"""
|
631
|
-
content_ids = []
|
632
|
-
files = loader.load()
|
633
|
-
for file_metadata in files:
|
634
|
-
labels = {"file_name": file_metadata.path}
|
635
|
-
content_id = self.upload_file(
|
636
|
-
extraction_graph,
|
637
|
-
file_metadata.path,
|
638
|
-
loader.read_all_bytes(file_metadata),
|
639
|
-
labels=labels,
|
640
|
-
)
|
641
|
-
content_ids.append(content_id)
|
642
|
-
return content_ids
|
643
|
-
|
644
|
-
def list_schemas(self) -> List[str]:
|
645
|
-
"""
|
646
|
-
List all schemas in the current namespace.
|
647
|
-
"""
|
648
|
-
response = self.get(f"namespaces/{self.namespace}/schemas")
|
649
|
-
return response.json()
|
650
|
-
|
651
|
-
def get_extracted_content(
|
652
|
-
self,
|
653
|
-
ingested_content_id: str,
|
654
|
-
graph_name: str,
|
655
|
-
policy_name: str,
|
656
|
-
blocking=False,
|
657
|
-
):
|
658
|
-
"""
|
659
|
-
Get list of child for a given content id and their content up to the specified level.
|
660
|
-
|
661
|
-
Args:
|
662
|
-
- ingested_content_id (str): id of content
|
663
|
-
- graph_name (str): name of extraction graph
|
664
|
-
- policy_name(str): name of extraction policy in the graph
|
665
|
-
- blocking (bool): wait for extraction to complete before returning (default: False)
|
666
|
-
"""
|
667
|
-
if blocking:
|
668
|
-
self.wait_for_extraction(ingested_content_id)
|
669
|
-
response = self.get(
|
670
|
-
f"namespaces/{self.namespace}/extraction_graphs/{graph_name}/content/{ingested_content_id}/extraction_policies/{policy_name}"
|
671
|
-
)
|
672
|
-
content_tree = response.json()
|
673
|
-
child_list = []
|
674
|
-
for item in content_tree["content_tree_metadata"]:
|
675
|
-
if (
|
676
|
-
graph_name in item["extraction_graph_names"]
|
677
|
-
and item["source"] == policy_name
|
678
|
-
):
|
679
|
-
content = self.download_content(item["id"])
|
680
|
-
child_list.append(
|
681
|
-
{
|
682
|
-
"id": item["id"],
|
683
|
-
"mime_type": item["mime_type"],
|
684
|
-
"content": content,
|
685
|
-
}
|
686
|
-
)
|
687
|
-
|
688
|
-
return child_list
|
689
|
-
|
690
|
-
def sql_query(self, query: str):
|
691
|
-
"""
|
692
|
-
Execute a SQL query.
|
693
|
-
|
694
|
-
Args:
|
695
|
-
- query (str): SQL query to be executed
|
696
|
-
"""
|
697
|
-
req = {"query": query}
|
698
|
-
response = self.post(
|
699
|
-
f"namespaces/{self.namespace}/sql_query",
|
700
|
-
json=req,
|
701
|
-
headers={"Content-Type": "application/json"},
|
702
|
-
)
|
703
|
-
result = response.json()
|
704
|
-
rows = []
|
705
|
-
for row in result["rows"]:
|
706
|
-
data = row["data"]
|
707
|
-
rows.append(data)
|
708
|
-
return SqlQueryResult(result=rows)
|
709
|
-
|
710
|
-
def ingest_remote_file(
|
711
|
-
self,
|
712
|
-
extraction_graph: str,
|
713
|
-
url: str,
|
714
|
-
mime_type: str,
|
715
|
-
labels: Dict[str, str] = {},
|
716
|
-
id=None,
|
717
|
-
):
|
718
|
-
req = {
|
719
|
-
"url": url,
|
720
|
-
"mime_type": mime_type,
|
721
|
-
"labels": labels,
|
722
|
-
"id": id,
|
723
|
-
}
|
724
|
-
response = self.post(
|
725
|
-
f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract_remote",
|
726
|
-
json=req,
|
727
|
-
headers={"Content-Type": "application/json"},
|
728
|
-
)
|
729
|
-
response.raise_for_status()
|
730
|
-
return response.json()["content_id"]
|
731
|
-
|
732
|
-
def wait_for_extraction(self, content_ids: Union[str, List[str]]):
|
733
|
-
"""
|
734
|
-
Wait for extraction to complete for a given content id
|
735
|
-
|
736
|
-
Args:
|
737
|
-
- content_id (str): id of content
|
738
|
-
"""
|
739
|
-
if type(content_ids) == str:
|
740
|
-
content_ids = [content_ids]
|
741
|
-
print(
|
742
|
-
"Waiting for extraction to complete for content id: ", ",".join(content_ids)
|
743
|
-
)
|
744
|
-
for content_id in content_ids:
|
745
|
-
response = self.get(
|
746
|
-
f"namespaces/{self.namespace}/content/{content_id}/wait"
|
747
|
-
)
|
748
|
-
print("Extraction completed for content id: ", content_id)
|
749
|
-
response.raise_for_status()
|
750
|
-
|
751
|
-
def generate_unique_hex_id(self):
|
752
|
-
"""
|
753
|
-
Generate a unique hexadecimal identifier
|
754
|
-
|
755
|
-
Returns:
|
756
|
-
str: a unique hexadecimal string
|
757
|
-
"""
|
758
|
-
logging.warning(
|
759
|
-
"This method is deprecated. Use generate_unique_hex_id from indexify instead."
|
760
|
-
)
|
761
|
-
return uuid.uuid4().hex[:16]
|
762
|
-
|
763
|
-
def generate_hash_from_string(self, input_string: str):
|
764
|
-
"""
|
765
|
-
Generate a hash for the given string and return it as a hexadecimal string.
|
766
|
-
|
767
|
-
Args:
|
768
|
-
input_string (str): The input string to hash.
|
769
|
-
|
770
|
-
Returns:
|
771
|
-
str: The hexadecimal hash of the input string.
|
772
|
-
"""
|
773
|
-
logging.warning(
|
774
|
-
"This method is deprecated. Use generate_hash_from_string from indexify instead."
|
775
|
-
)
|
776
|
-
hash_object = hashlib.sha256(input_string.encode())
|
777
|
-
return hash_object.hexdigest()[:16]
|
778
|
-
|
779
|
-
def __print_additional_error_context(self, error: Error):
|
780
|
-
print(error)
|
781
|
-
|
782
|
-
if error.status == "ExtractionGraphError":
|
783
|
-
graphs = [eg.name for eg in self.extraction_graphs]
|
784
|
-
extractors = [ext.name for ext in self.extractors()]
|
785
|
-
print(f"Available extraction graphs: {graphs}")
|
786
|
-
print(f"Available extractors: {extractors}")
|
787
|
-
|
788
|
-
if error.status == "SearchError":
|
789
|
-
indexes = [index["name"] for index in self.indexes()]
|
790
|
-
print(f"Available indexes: {indexes}")
|
1
|
+
from typing import Optional
|
2
|
+
|
3
|
+
from .base_client import IndexifyClient
|
4
|
+
from .local_client import LocalClient
|
5
|
+
from .remote_client import RemoteClient
|
6
|
+
from .settings import DEFAULT_SERVICE_URL
|
7
|
+
|
8
|
+
|
9
|
+
def create_client(
|
10
|
+
service_url: str = DEFAULT_SERVICE_URL,
|
11
|
+
config_path: Optional[str] = None,
|
12
|
+
in_process: bool = False,
|
13
|
+
*args,
|
14
|
+
**kwargs,
|
15
|
+
) -> IndexifyClient:
|
16
|
+
if in_process:
|
17
|
+
return LocalClient()
|
18
|
+
return RemoteClient(config_path=config_path, service_url=service_url, **kwargs)
|