indexify 0.0.21__tar.gz → 0.0.22__tar.gz
- {indexify-0.0.21 → indexify-0.0.22}/PKG-INFO +1 -1
- {indexify-0.0.21 → indexify-0.0.22}/indexify/client.py +80 -57
- indexify-0.0.22/indexify/error.py +30 -0
- {indexify-0.0.21 → indexify-0.0.22}/pyproject.toml +1 -1
- {indexify-0.0.21 → indexify-0.0.22}/LICENSE.txt +0 -0
- {indexify-0.0.21 → indexify-0.0.22}/README.md +0 -0
- {indexify-0.0.21 → indexify-0.0.22}/indexify/__init__.py +0 -0
- {indexify-0.0.21 → indexify-0.0.22}/indexify/data_containers.py +0 -0
- {indexify-0.0.21 → indexify-0.0.22}/indexify/exceptions.py +0 -0
- {indexify-0.0.21 → indexify-0.0.22}/indexify/extraction_policy.py +0 -0
- {indexify-0.0.21 → indexify-0.0.22}/indexify/extractor.py +0 -0
- {indexify-0.0.21 → indexify-0.0.22}/indexify/index.py +0 -0
- {indexify-0.0.21 → indexify-0.0.22}/indexify/settings.py +0 -0
- {indexify-0.0.21 → indexify-0.0.22}/indexify/utils.py +0 -0
{indexify-0.0.21 → indexify-0.0.22}/indexify/client.py

@@ -9,10 +9,10 @@ from .extractor import Extractor
 from .extraction_policy import ExtractionPolicy, ExtractionGraph
 from .index import Index
 from .utils import json_set_default
+from .error import Error
 from .data_containers import TextChunk
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
-
 from typing import List, Optional, Union, Dict
 
 Document = namedtuple("Document", ["text", "labels", "id"])
@@ -75,12 +75,7 @@ class IndexifyClient:
         self._timeout = kwargs.get("timeout")
 
         # get namespace data
-        response = self.get(f"namespaces/{self.namespace}")
-        response.raise_for_status()
-        resp_json = response.json()
-        # initialize extraction_policies
-        for eb in resp_json["namespace"]["extraction_graphs"]:
-            self.extraction_graphs.append(ExtractionGraph.from_dict(eb))
+        self.extraction_graphs = self.get_extraction_graphs()
 
     @classmethod
     def with_mtls(
@@ -130,12 +125,18 @@ class IndexifyClient:
         return client
 
     def _request(self, method: str, **kwargs) -> httpx.Response:
-        response = self._client.request(method, timeout=self._timeout, **kwargs)
         try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
-
+            response = self._client.request(method, timeout=self._timeout, **kwargs)
+            status_code = str(response.status_code)
+            if status_code.startswith("4") or status_code.startswith("5"):
+                error = Error.from_tonic_error_string(str(response.url), response.text)
+                self.__print_additional_error_context(error)
+                raise error
+        except httpx.ConnectError:
+            message = f"Make sure the server is running and accesible at {self._service_url}"
+            error = Error(status="ConnectionError", message=message)
+            print(error)
+            raise error
         return response
 
     def get(self, endpoint: str, **kwargs) -> httpx.Response:
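With this change, a 4xx/5xx response surfaces as an indexify.error.Error (printed with extra context, then raised) instead of an httpx.HTTPStatusError, and a refused connection becomes an Error with status "ConnectionError". A minimal sketch of what calling code sees in 0.0.22, assuming a client pointed at the default service URL:

    from indexify import IndexifyClient
    from indexify.error import Error

    client = IndexifyClient()
    try:
        client.get("namespaces/default")  # any endpoint; get() is defined in this diff
    except Error as e:
        # e.status is "GeneralError", "ExtractionGraphError",
        # "SearchError", or "ConnectionError"
        print(e.status, "-", e.message)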
@@ -291,7 +292,6 @@ class IndexifyClient:
             List[Index]: list of indexes in the current namespace
         """
         response = self.get(f"namespaces/{self.namespace}/indexes")
-        response.raise_for_status()
         return response.json()["indexes"]
 
     def extractors(self) -> List[Extractor]:
@@ -308,17 +308,18 @@ class IndexifyClient:
             extractors.append(Extractor.from_dict(ed))
         return extractors
 
-    def
+    def get_extraction_graphs(self) -> List[ExtractionGraph]:
         """
         Retrieve and update the list of extraction policies for the current namespace.
         """
         response = self.get(f"namespaces/{self.namespace}")
-        response.raise_for_status()
+        json = response.json()
 
-        self.extraction_graphs = []
-        for eb in response.json()["namespace"]["extraction_graphs"]:
-            self.extraction_graphs.append(ExtractionGraph.from_dict(eb))
-
+        self.extraction_graphs = []
+        for graph in json["namespace"]["extraction_graphs"]:
+            self.extraction_graphs.append(ExtractionGraph.from_dict(graph))
+
+        return self.extraction_graphs
 
     def create_extraction_graph(self, extraction_graph: ExtractionGraph):
         """
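Because get_extraction_graphs both refreshes self.extraction_graphs and returns it, callers can re-sync the cached list after server-side changes. A small usage sketch, relying on ExtractionGraph.name (which the error-context code later in this diff also assumes):

    graphs = client.get_extraction_graphs()   # refreshes client.extraction_graphs
    print([graph.name for graph in graphs])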
@@ -335,7 +336,6 @@ class IndexifyClient:
             data=request_body,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return
 
     def get_content_metadata(self, content_id: str) -> dict:
@@ -346,29 +346,8 @@ class IndexifyClient:
             - content_id (str): content id to query
         """
         response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
-        response.raise_for_status()
         return response.json()
-
-    def get_extracted_content(
-        self,
-        content_id: str = None,
-    ):
-        """
-        Get list of content from current namespace.
-
-        Args:
-            - parent_id (str): Optional filter for parent id
-            - labels_eq (str): Optional filter for labels
-        """
-        params = {"parent_id": content_id}
-
-        response = self.get(f"namespaces/{self.namespace}/content", params=params)
-        response.raise_for_status()
-        return [
-            self._add_content_url(content)
-            for content in response.json()["content_list"]
-        ]
-
+
     def download_content(self, id: str) -> bytes:
         """
         Download content from id. Return bytes
@@ -377,18 +356,14 @@ class IndexifyClient:
             - id (str): id of content to download
         """
         response = self.get(f"namespaces/{self.namespace}/content/{id}/download")
-        try:
-            response.raise_for_status()
-            return response.content
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
+        return response.content
 
     def add_documents(
         self,
         extraction_graphs: Union[str, List[str]],
         documents: Union[Document, str, List[Union[Document, str]]],
         doc_id=None,
-    ) ->
+    ) -> Union[str, List[str]]:
         """
         Add documents to current namespace.
 
@@ -430,6 +405,11 @@ class IndexifyClient:
             headers={"Content-Type": "application/json"},
         )
         response.raise_for_status()
+        response_json = response.json()
+        content_ids = response_json["content_ids"]
+        if len(documents) == 1 and len(content_ids) == 1:
+            return content_ids[0]
+        return content_ids
 
     def delete_documents(self, document_ids: List[str]) -> None:
         """
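add_documents previously returned nothing; it now returns the server-assigned content ids, unwrapping to a single string when a single document yields a single id. A hedged sketch, with the graph name purely illustrative:

    # one document in -> one content id (str) out
    content_id = client.add_documents("mygraph", "some text")

    # a list in -> a list of ids (List[str]) out
    content_ids = client.add_documents("mygraph", ["first doc", "second doc"])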
@@ -444,7 +424,6 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
 
     def update_content(self, document_id: str, path: str) -> None:
         """
@@ -457,7 +436,6 @@ class IndexifyClient:
             response = self.put(
                 f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
             )
-            response.raise_for_status()
 
     def get_structured_data(self, content_id: str) -> dict:
         """
@@ -469,7 +447,6 @@ class IndexifyClient:
         response = self.get(
             f"namespaces/{self.namespace}/content/{content_id}/metadata"
         )
-        response.raise_for_status()
         return response.json().get("metadata", [])
 
     def search_index(
@@ -490,7 +467,6 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return response.json()["results"]
 
     def upload_file(self, extraction_graphs: Union[str, List[str]], path: str, id=None, labels: dict = {}) -> str:
@@ -513,7 +489,6 @@ class IndexifyClient:
                 data=labels,
                 params=params,
             )
-            response.raise_for_status()
             response_json = response.json()
             return response_json["content_id"]
 
@@ -522,7 +497,6 @@ class IndexifyClient:
         List all schemas in the current namespace.
         """
         response = self.get(f"namespaces/{self.namespace}/schemas")
-        response.raise_for_status()
         return response.json()
 
     def get_content_tree(self, content_id: str):
@@ -535,9 +509,35 @@ class IndexifyClient:
         response = self.get(
             f"namespaces/{self.namespace}/content/{content_id}/content-tree"
         )
-        response.raise_for_status()
         return response.json()
 
+    def get_extracted_content(self, content_id: str, level: int = 0):
+        """
+        Get list of child for a given content id and their content up to the specified level.
+
+        Args:
+            - content_id (str): id of content
+            - level (int): depth of content retrieval (default: 0)
+        """
+        content_tree = self.get_content_tree(content_id)
+        child_list = []
+
+        def traverse_content(parent_id, current_level):
+            if current_level > level:
+                return
+
+            for item in content_tree['content_tree_metadata']:
+                if item['parent_id'] == parent_id:
+                    child_id = item['id']
+                    content = self.download_content(child_id)
+                    child_list.append({'id': child_id, 'content': content})
+
+                    traverse_content(child_id, current_level + 1)
+
+        traverse_content(content_id, 0)
+
+        return child_list
+
     def sql_query(self, query: str):
         """
         Execute a SQL query.
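The reintroduced get_extracted_content is now a client-side walk: it fetches the content tree once, then recursively downloads every descendant whose depth does not exceed level (so level=0 returns only direct children). Illustrative usage, with a made-up content id:

    children = client.get_extracted_content("some-content-id", level=1)
    for child in children:
        print(child["id"], len(child["content"]), "bytes")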
@@ -551,7 +551,6 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         result = response.json()
         rows = []
         for row in result["rows"]:
@@ -570,8 +569,19 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return response.json()
+
+    def wait_for_extraction(self, content_id: str):
+        """
+        Wait for extraction to complete for a given content id
+
+        Args:
+            - content_id (str): id of content
+        """
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/wait"
+        )
+        response.raise_for_status()
 
     def generate_unique_hex_id(self):
         """
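wait_for_extraction blocks on the server's /wait endpoint until extraction for the given content finishes, which pairs naturally with the content id now returned by add_documents. Sketch (graph name assumed):

    content_id = client.add_documents("mygraph", "some text")
    client.wait_for_extraction(content_id)  # returns once extractors have run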
@@ -594,3 +604,16 @@ class IndexifyClient:
         """
         hash_object = hashlib.sha256(input_string.encode())
         return hash_object.hexdigest()[:16]
+
+    def __print_additional_error_context(self, error: Error):
+        print(error)
+
+        if error.status == "ExtractionGraphError":
+            graphs = [eg.name for eg in self.extraction_graphs]
+            extractors = [ext.name for ext in self.extractors()]
+            print(f"Available extraction graphs: {graphs}")
+            print(f"Available extractors: {extractors}")
+
+        if error.status == "SearchError":
+            indexes = [index["name"] for index in self.indexes()]
+            print(f"Available indexes: {indexes}")
indexify-0.0.22/indexify/error.py (new file)

@@ -0,0 +1,30 @@
+class Error(Exception):
+    status: str
+    message: str
+
+    def __init__(self, status: str, message: str):
+        self.status = status
+        self.message = message
+
+    @staticmethod
+    def from_tonic_error_string(url: str, error: str) -> "Error":
+        data = error.split(", ")
+
+        message = data[1].split(": ", 1)[1]
+        if message.startswith('"') and message.endswith('"'):
+            message = message[1:-1]
+
+        status = "GeneralError"
+        if "extraction_graph" in url:
+            status = "ExtractionGraphError"
+        elif "search" in url:
+            status = "SearchError"
+
+        error = Error(status, message)
+        return error
+
+    def __str__(self):
+        return f"{self.status} | {self.message.capitalize()}"
+
+    def __repr__(self):
+        return f"Error(status={self.status!r}, message={self.message!r})"
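from_tonic_error_string assumes a tonic-style body of the form status: <code>, message: "<text>" and classifies the error by substring-matching the request URL. A worked example, with an illustrative URL and body:

    from indexify.error import Error

    body = 'status: 404, message: "index not found"'  # illustrative tonic-style body
    err = Error.from_tonic_error_string(
        "http://localhost:8900/namespaces/default/search", body
    )
    print(err.status)  # "SearchError" (the URL contains "search")
    print(err)         # SearchError | Index not found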