indexify 0.0.21-py3-none-any.whl → 0.0.23-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/__init__.py +4 -2
- indexify/client.py +106 -57
- indexify/error.py +30 -0
- {indexify-0.0.21.dist-info → indexify-0.0.23.dist-info}/METADATA +1 -1
- {indexify-0.0.21.dist-info → indexify-0.0.23.dist-info}/RECORD +7 -6
- {indexify-0.0.21.dist-info → indexify-0.0.23.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.21.dist-info → indexify-0.0.23.dist-info}/WHEEL +0 -0
indexify/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from .index import Index
 from .client import IndexifyClient
-from .extraction_policy import
-from .client import IndexifyClient, Document
+from .extraction_policy import ExtractionGraph
+from .client import IndexifyClient, Document, generate_hash_from_string, generate_unique_hex_id
 from .settings import DEFAULT_SERVICE_URL
 
 __all__ = [
@@ -11,4 +11,6 @@ __all__ = [
     "ExtractionGraph",
     "ExtractionGraphBuilder" "ExtractionPolicy",
     "DEFAULT_SERVICE_URL",
+    "generate_hash_from_string",
+    "generate_unique_hex_id",
 ]
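The two id helpers are now importable straight from the package root. A minimal usage sketch (the input string is a placeholder):

from indexify import generate_hash_from_string, generate_unique_hex_id

doc_id = generate_hash_from_string("my document text")  # deterministic, 16 hex chars
run_id = generate_unique_hex_id()                       # random, 16 hex chars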
indexify/client.py
CHANGED
@@ -9,16 +9,39 @@ from .extractor import Extractor
 from .extraction_policy import ExtractionPolicy, ExtractionGraph
 from .index import Index
 from .utils import json_set_default
+from .error import Error
 from .data_containers import TextChunk
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
-
 from typing import List, Optional, Union, Dict
+import logging
 
 Document = namedtuple("Document", ["text", "labels", "id"])
 
 SQLQueryRow = namedtuple("SQLQueryRow", ["content_id", "data"])
 
+def generate_unique_hex_id():
+    """
+    Generate a unique hexadecimal identifier
+
+    Returns:
+        str: a unique hexadecimal string
+    """
+    return uuid.uuid4().hex[:16]
+
+def generate_hash_from_string(input_string: str):
+    """
+    Generate a hash for the given string and return it as a hexadecimal string.
+
+    Args:
+        input_string (str): The input string to hash.
+
+    Returns:
+        str: The hexadecimal hash of the input string.
+    """
+    hash_object = hashlib.sha256(input_string.encode())
+    return hash_object.hexdigest()[:16]
+
 
 @dataclass
 class SqlQueryResult:
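These helpers were previously only available as IndexifyClient methods; the method forms now log deprecation warnings (see the hunks at the end of this file). In plain stdlib terms their behavior is, as a sketch:

import hashlib
import uuid

# generate_hash_from_string: stable, content-derived 16-hex-char id
assert hashlib.sha256("abc".encode()).hexdigest()[:16] == hashlib.sha256("abc".encode()).hexdigest()[:16]

# generate_unique_hex_id: fresh random id, the first 16 of uuid4's 32 hex chars
assert len(uuid.uuid4().hex[:16]) == 16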
@@ -75,12 +98,7 @@ class IndexifyClient:
         self._timeout = kwargs.get("timeout")
 
         # get namespace data
-        response = self.get(f"namespaces/{self.namespace}")
-        response.raise_for_status()
-        resp_json = response.json()
-        # initialize extraction_policies
-        for eb in resp_json["namespace"]["extraction_graphs"]:
-            self.extraction_graphs.append(ExtractionGraph.from_dict(eb))
+        self.extraction_graphs = self.get_extraction_graphs()
 
     @classmethod
     def with_mtls(
@@ -130,12 +148,19 @@ class IndexifyClient:
         return client
 
     def _request(self, method: str, **kwargs) -> httpx.Response:
-        response = self._client.request(method, timeout=self._timeout, **kwargs)
         try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
-
+            response = self._client.request(method, timeout=self._timeout, **kwargs)
+            status_code = str(response.status_code)
+            if status_code.startswith("4") or status_code.startswith("5"):
+                raise ApiException(response.text)
+                #error = Error.from_tonic_error_string(str(response.url), response.text)
+                #self.__print_additional_error_context(error)
+                #raise error
+        except httpx.ConnectError:
+            message = f"Make sure the server is running and accesible at {self._service_url}"
+            error = Error(status="ConnectionError", message=message)
+            print(error)
+            raise error
         return response
 
     def get(self, endpoint: str, **kwargs) -> httpx.Response:
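_request now maps any 4xx/5xx response to ApiException carrying the response body, and turns a failed connection into the new Error type with a readable hint; the commented-out lines sketch a planned richer path through Error.from_tonic_error_string. Since the constructor now calls get_extraction_graphs() (see above), both failure modes can surface at construction time. A hedged caller-side sketch, assuming default construction against DEFAULT_SERVICE_URL:

from indexify import IndexifyClient
from indexify.error import Error
from indexify.exceptions import ApiException

try:
    client = IndexifyClient()
except Error as e:
    print(e)       # e.g. "ConnectionError | Make sure the server is running ..."
except ApiException as e:
    print(e)       # body of the 4xx/5xx response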
@@ -291,7 +316,6 @@ class IndexifyClient:
             List[Index]: list of indexes in the current namespace
         """
         response = self.get(f"namespaces/{self.namespace}/indexes")
-        response.raise_for_status()
         return response.json()["indexes"]
 
     def extractors(self) -> List[Extractor]:
@@ -308,17 +332,18 @@ class IndexifyClient:
             extractors.append(Extractor.from_dict(ed))
         return extractors
 
-    def get_extraction_policies(self):
+    def get_extraction_graphs(self) -> List[ExtractionGraph]:
         """
         Retrieve and update the list of extraction policies for the current namespace.
         """
         response = self.get(f"namespaces/{self.namespace}")
-        response.raise_for_status()
+        json = response.json()
+
+        self.extraction_graphs = []
+        for graph in json["namespace"]["extraction_graphs"]:
+            self.extraction_graphs.append(ExtractionGraph.from_dict(graph))
 
-        self.extraction_policies = []
-        for eb in response.json()["namespace"]["extraction_policies"]:
-            self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
-        return self.extraction_policies
+        return self.extraction_graphs
 
     def create_extraction_graph(self, extraction_graph: ExtractionGraph):
         """
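get_extraction_graphs replaces the old policy-based accessor: it refreshes self.extraction_graphs from the namespace endpoint and returns the refreshed list, which is also what the constructor now calls. A usage sketch, assuming a reachable server (that ExtractionGraph objects carry a name attribute is also assumed by the error-context helper at the end of this file):

client = IndexifyClient()
for graph in client.get_extraction_graphs():
    print(graph.name)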
@@ -335,7 +360,6 @@ class IndexifyClient:
             data=request_body,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return
 
     def get_content_metadata(self, content_id: str) -> dict:
@@ -346,29 +370,8 @@ class IndexifyClient:
         - content_id (str): content id to query
         """
         response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
-        response.raise_for_status()
         return response.json()
-
-    def get_extracted_content(
-        self,
-        content_id: str = None,
-    ):
-        """
-        Get list of content from current namespace.
-
-        Args:
-            - parent_id (str): Optional filter for parent id
-            - labels_eq (str): Optional filter for labels
-        """
-        params = {"parent_id": content_id}
-
-        response = self.get(f"namespaces/{self.namespace}/content", params=params)
-        response.raise_for_status()
-        return [
-            self._add_content_url(content)
-            for content in response.json()["content_list"]
-        ]
-
+
     def download_content(self, id: str) -> bytes:
         """
         Download content from id. Return bytes
@@ -377,18 +380,14 @@ class IndexifyClient:
         - id (str): id of content to download
         """
         response = self.get(f"namespaces/{self.namespace}/content/{id}/download")
-        try:
-            response.raise_for_status()
-            return response.content
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
+        return response.content
 
     def add_documents(
         self,
         extraction_graphs: Union[str, List[str]],
         documents: Union[Document, str, List[Union[Document, str]]],
         doc_id=None,
-    ) ->
+    ) -> Union[str, List[str]]:
         """
         Add documents to current namespace.
 
@@ -430,6 +429,11 @@ class IndexifyClient:
             headers={"Content-Type": "application/json"},
         )
         response.raise_for_status()
+        response_json = response.json()
+        content_ids = response_json["content_ids"]
+        if len(documents) == 1 and len(content_ids) == 1:
+            return content_ids[0]
+        return content_ids
 
     def delete_documents(self, document_ids: List[str]) -> None:
         """
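add_documents now returns the server-assigned content ids: a bare id for a single document, a list otherwise (this assumes the earlier, unshown part of the method normalizes a lone document into a one-element list). A sketch with a placeholder graph name:

content_id = client.add_documents("my_graph", "a single document")      # -> str
content_ids = client.add_documents("my_graph", ["doc one", "doc two"])  # -> List[str]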
@@ -444,7 +448,6 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
 
     def update_content(self, document_id: str, path: str) -> None:
         """
@@ -457,7 +460,6 @@ class IndexifyClient:
         response = self.put(
             f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
         )
-        response.raise_for_status()
 
     def get_structured_data(self, content_id: str) -> dict:
         """
@@ -469,7 +471,6 @@ class IndexifyClient:
         response = self.get(
             f"namespaces/{self.namespace}/content/{content_id}/metadata"
         )
-        response.raise_for_status()
         return response.json().get("metadata", [])
 
     def search_index(
@@ -490,7 +491,6 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return response.json()["results"]
 
     def upload_file(self, extraction_graphs: Union[str, List[str]], path: str, id=None, labels: dict = {}) -> str:
@@ -513,7 +513,6 @@ class IndexifyClient:
             data=labels,
             params=params,
         )
-        response.raise_for_status()
         response_json = response.json()
         return response_json["content_id"]
 
@@ -522,7 +521,6 @@ class IndexifyClient:
         List all schemas in the current namespace.
         """
         response = self.get(f"namespaces/{self.namespace}/schemas")
-        response.raise_for_status()
         return response.json()
 
     def get_content_tree(self, content_id: str):
@@ -535,9 +533,35 @@ class IndexifyClient:
         response = self.get(
             f"namespaces/{self.namespace}/content/{content_id}/content-tree"
         )
-        response.raise_for_status()
         return response.json()
 
+    def get_extracted_content(self, content_id: str, level: int = 0):
+        """
+        Get list of child for a given content id and their content up to the specified level.
+
+        Args:
+            - content_id (str): id of content
+            - level (int): depth of content retrieval (default: 0)
+        """
+        content_tree = self.get_content_tree(content_id)
+        child_list = []
+
+        def traverse_content(parent_id, current_level):
+            if current_level > level:
+                return
+
+            for item in content_tree['content_tree_metadata']:
+                if item['parent_id'] == parent_id:
+                    child_id = item['id']
+                    content = self.download_content(child_id)
+                    child_list.append({'id': child_id, 'content': content})
+
+                    traverse_content(child_id, current_level + 1)
+
+        traverse_content(content_id, 0)
+
+        return child_list
+
     def sql_query(self, query: str):
         """
         Execute a SQL query.
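get_extracted_content is reworked: instead of querying the content listing endpoint, it walks the get_content_tree output and downloads each child's bytes. level=0 (the default) returns direct children only; each increment descends one more generation. A sketch with a placeholder id:

children = client.get_extracted_content("abc123", level=1)
for child in children:
    print(child["id"], len(child["content"]))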
@@ -551,7 +575,6 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         result = response.json()
         rows = []
         for row in result["rows"]:
@@ -570,8 +593,19 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return response.json()
+
+    def wait_for_extraction(self, content_id: str):
+        """
+        Wait for extraction to complete for a given content id
+
+        Args:
+            - content_id (str): id of content
+        """
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/wait"
+        )
+        response.raise_for_status()
 
     def generate_unique_hex_id(self):
         """
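wait_for_extraction blocks on the server's /wait endpoint, which makes upload-then-read flows straightforward. A sketch with placeholder graph and file names:

content_id = client.upload_file("my_graph", "document.pdf")
client.wait_for_extraction(content_id)   # returns once the server reports extraction done
children = client.get_extracted_content(content_id)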
@@ -580,6 +614,7 @@ class IndexifyClient:
         Returns:
             str: a unique hexadecimal string
         """
+        logging.warning("This method is deprecated. Use generate_unique_hex_id from indexify instead.")
         return uuid.uuid4().hex[:16]
 
     def generate_hash_from_string(self, input_string: str):
@@ -592,5 +627,19 @@ class IndexifyClient:
         Returns:
             str: The hexadecimal hash of the input string.
         """
+        logging.warning("This method is deprecated. Use generate_hash_from_string from indexify instead.")
         hash_object = hashlib.sha256(input_string.encode())
         return hash_object.hexdigest()[:16]
+
+    def __print_additional_error_context(self, error: Error):
+        print(error)
+
+        if error.status == "ExtractionGraphError":
+            graphs = [eg.name for eg in self.extraction_graphs]
+            extractors = [ext.name for ext in self.extractors()]
+            print(f"Available extraction graphs: {graphs}")
+            print(f"Available extractors: {extractors}")
+
+        if error.status == "SearchError":
+            indexes = [index["name"] for index in self.indexes()]
+            print(f"Available indexes: {indexes}")
indexify/error.py
ADDED
@@ -0,0 +1,30 @@
+class Error(Exception):
+    status: str
+    message: str
+
+    def __init__(self, status: str, message: str):
+        self.status = status
+        self.message = message
+
+    @staticmethod
+    def from_tonic_error_string(url: str, error: str) -> "Error":
+        data = error.split(", ")
+
+        message = data[1].split(": ", 1)[1]
+        if message.startswith('"') and message.endswith('"'):
+            message = message[1:-1]
+
+        status = "GeneralError"
+        if "extraction_graph" in url:
+            status = "ExtractionGraphError"
+        elif "search" in url:
+            status = "SearchError"
+
+        error = Error(status, message)
+        return error
+
+    def __str__(self):
+        return f"{self.status} | {self.message.capitalize()}"
+
+    def __repr__(self):
+        return f"Error(status={self.status!r}, message={self.message!r})"
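from_tonic_error_string parses the server's tonic-style error text and classifies it by the failing URL; note that the naive split(", ") assumes the message itself contains no comma-space. A hedged round-trip example (the URL and error string are illustrative, not captured server output):

from indexify.error import Error

err = Error.from_tonic_error_string(
    "http://localhost:8900/namespaces/default/extraction_graphs",
    'code: 3, message: "extraction graph not found"',
)
print(err)  # ExtractionGraphError | Extraction graph not found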
{indexify-0.0.21.dist-info → indexify-0.0.23.dist-info}/RECORD
CHANGED
@@ -1,13 +1,14 @@
-indexify/__init__.py,sha256=
-indexify/client.py,sha256=
+indexify/__init__.py,sha256=Y40-Ur_tL7kGGs-reh9BTfEYGe-KyGxgdg-CmoFsXRQ,473
+indexify/client.py,sha256=Q6QJ_yzJMmH_h0x3EwXL69qmp-TPrU7lcQedw__rRnk,21238
 indexify/data_containers.py,sha256=r1wxJPtsmXbyKvb17fqxm-dPjKz51oZ62f8A8Zxls1c,361
+indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
 indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
 indexify/extraction_policy.py,sha256=dIyQK3N-QOpQ0BPjiZ_635o8A5ITNxaz1syQ_FPaE0k,1851
 indexify/extractor.py,sha256=sWFLlXHgEfWlmiKAXN6ytUt_uG7th-XGNHqz-TG39gs,1216
 indexify/index.py,sha256=RvxYhJXEth-GKvqzlMiz5PuN1eIbZk84pt20piA1Gsw,504
 indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
 indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
-indexify-0.0.21.dist-info/LICENSE.txt,sha256=
-indexify-0.0.21.dist-info/METADATA,sha256=
-indexify-0.0.21.dist-info/WHEEL,sha256=
-indexify-0.0.21.dist-info/RECORD,,
+indexify-0.0.23.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.0.23.dist-info/METADATA,sha256=vQqfHcLrf52YvCNbuAc1m9yLh-rVSGkRqfMKbcTuSb0,1753
+indexify-0.0.23.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+indexify-0.0.23.dist-info/RECORD,,
{indexify-0.0.21.dist-info → indexify-0.0.23.dist-info}/LICENSE.txt
File without changes
{indexify-0.0.21.dist-info → indexify-0.0.23.dist-info}/WHEEL
File without changes