indexify-0.0.21-py3-none-any.whl → indexify-0.0.23-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
indexify/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from .index import Index
  from .client import IndexifyClient
- from .extraction_policy import ExtractionPolicy, ExtractionGraphBuilder, ExtractionGraph
- from .client import IndexifyClient, Document
+ from .extraction_policy import ExtractionGraph
+ from .client import IndexifyClient, Document, generate_hash_from_string, generate_unique_hex_id
  from .settings import DEFAULT_SERVICE_URL

  __all__ = [
@@ -11,4 +11,6 @@ __all__ = [
      "ExtractionGraph",
      "ExtractionGraphBuilder" "ExtractionPolicy",
      "DEFAULT_SERVICE_URL",
+     "generate_hash_from_string",
+     "generate_unique_hex_id",
  ]
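With this release, the ID helpers previously available only as IndexifyClient methods are exported at package level. A minimal usage sketch based on the exports above (the key string is illustrative):

    from indexify import generate_hash_from_string, generate_unique_hex_id

    doc_id = generate_unique_hex_id()                 # random 16-char hex, new on every call
    stable_id = generate_hash_from_string("doc-key")  # deterministic 16-char SHA-256 prefix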
indexify/client.py CHANGED
@@ -9,16 +9,39 @@ from .extractor import Extractor
  from .extraction_policy import ExtractionPolicy, ExtractionGraph
  from .index import Index
  from .utils import json_set_default
+ from .error import Error
  from .data_containers import TextChunk
  from indexify.exceptions import ApiException
  from dataclasses import dataclass
-
  from typing import List, Optional, Union, Dict
+ import logging

  Document = namedtuple("Document", ["text", "labels", "id"])

  SQLQueryRow = namedtuple("SQLQueryRow", ["content_id", "data"])

+ def generate_unique_hex_id():
+     """
+     Generate a unique hexadecimal identifier
+
+     Returns:
+         str: a unique hexadecimal string
+     """
+     return uuid.uuid4().hex[:16]
+
+ def generate_hash_from_string(input_string: str):
+     """
+     Generate a hash for the given string and return it as a hexadecimal string.
+
+     Args:
+         input_string (str): The input string to hash.
+
+     Returns:
+         str: The hexadecimal hash of the input string.
+     """
+     hash_object = hashlib.sha256(input_string.encode())
+     return hash_object.hexdigest()[:16]
+

  @dataclass
  class SqlQueryResult:
@@ -75,12 +98,7 @@ class IndexifyClient:
          self._timeout = kwargs.get("timeout")

          # get namespace data
-         response = self.get(f"namespaces/{self.namespace}")
-         response.raise_for_status()
-         resp_json = response.json()
-         # initialize extraction_policies
-         for eb in resp_json["namespace"]["extraction_graphs"]:
-             self.extraction_graphs.append(ExtractionGraph.from_dict(eb))
+         self.extraction_graphs = self.get_extraction_graphs()

      @classmethod
      def with_mtls(
@@ -130,12 +148,19 @@ class IndexifyClient:
          return client

      def _request(self, method: str, **kwargs) -> httpx.Response:
-         response = self._client.request(method, timeout=self._timeout, **kwargs)
          try:
-             response.raise_for_status()
-         except httpx.HTTPStatusError as exc:
-             print(f"exception: {exc}, response text: {response.text}")
-             raise exc
+             response = self._client.request(method, timeout=self._timeout, **kwargs)
+             status_code = str(response.status_code)
+             if status_code.startswith("4") or status_code.startswith("5"):
+                 raise ApiException(response.text)
+             #error = Error.from_tonic_error_string(str(response.url), response.text)
+             #self.__print_additional_error_context(error)
+             #raise error
+         except httpx.ConnectError:
+             message = f"Make sure the server is running and accesible at {self._service_url}"
+             error = Error(status="ConnectionError", message=message)
+             print(error)
+             raise error
          return response

      def get(self, endpoint: str, **kwargs) -> httpx.Response:
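Client code that previously caught httpx.HTTPStatusError should now catch ApiException for 4xx/5xx responses; connection failures raise the new Error type. A hedged sketch of the resulting calling pattern (the query is a placeholder):

    from indexify import IndexifyClient
    from indexify.error import Error
    from indexify.exceptions import ApiException

    client = IndexifyClient()
    try:
        results = client.sql_query("select * from ingestion;")  # placeholder query
    except ApiException as e:
        print(f"server rejected the request: {e}")  # 4xx/5xx response body
    except Error as e:
        print(e)  # e.g. "ConnectionError | Make sure the server is running ..."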
@@ -291,7 +316,6 @@ class IndexifyClient:
              List[Index]: list of indexes in the current namespace
          """
          response = self.get(f"namespaces/{self.namespace}/indexes")
-         response.raise_for_status()
          return response.json()["indexes"]

      def extractors(self) -> List[Extractor]:
@@ -308,17 +332,18 @@ class IndexifyClient:
              extractors.append(Extractor.from_dict(ed))
          return extractors

-     def get_extraction_policies(self):
+     def get_extraction_graphs(self) -> List[ExtractionGraph]:
          """
          Retrieve and update the list of extraction policies for the current namespace.
          """
          response = self.get(f"namespaces/{self.namespace}")
-         response.raise_for_status()
+         json = response.json()
+
+         self.extraction_graphs = []
+         for graph in json["namespace"]["extraction_graphs"]:
+             self.extraction_graphs.append(ExtractionGraph.from_dict(graph))

-         self.extraction_policies = []
-         for eb in response.json()["namespace"]["extraction_policies"]:
-             self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
-         return self.extraction_policies
+         return self.extraction_graphs

      def create_extraction_graph(self, extraction_graph: ExtractionGraph):
          """
@@ -335,7 +360,6 @@ class IndexifyClient:
              data=request_body,
              headers={"Content-Type": "application/json"},
          )
-         response.raise_for_status()
          return

      def get_content_metadata(self, content_id: str) -> dict:
@@ -346,29 +370,8 @@ class IndexifyClient:
              - content_id (str): content id to query
          """
          response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
-         response.raise_for_status()
          return response.json()
-
-     def get_extracted_content(
-         self,
-         content_id: str = None,
-     ):
-         """
-         Get list of content from current namespace.
-
-         Args:
-             - parent_id (str): Optional filter for parent id
-             - labels_eq (str): Optional filter for labels
-         """
-         params = {"parent_id": content_id}
-
-         response = self.get(f"namespaces/{self.namespace}/content", params=params)
-         response.raise_for_status()
-         return [
-             self._add_content_url(content)
-             for content in response.json()["content_list"]
-         ]
-
+
      def download_content(self, id: str) -> bytes:
          """
          Download content from id. Return bytes
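Note that get_extracted_content is not removed outright: it is reintroduced later in this diff (see the get_content_tree hunk below) with a different signature that walks the content tree instead of filtering the namespace content listing.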
@@ -377,18 +380,14 @@ class IndexifyClient:
              - id (str): id of content to download
          """
          response = self.get(f"namespaces/{self.namespace}/content/{id}/download")
-         try:
-             response.raise_for_status()
-             return response.content
-         except httpx.HTTPStatusError as exc:
-             raise ApiException(exc.response.text)
+         return response.content

      def add_documents(
          self,
          extraction_graphs: Union[str, List[str]],
          documents: Union[Document, str, List[Union[Document, str]]],
          doc_id=None,
-     ) -> None:
+     ) -> Union[str, List[str]]:
          """
          Add documents to current namespace.

@@ -430,6 +429,11 @@ class IndexifyClient:
              headers={"Content-Type": "application/json"},
          )
          response.raise_for_status()
+         response_json = response.json()
+         content_ids = response_json["content_ids"]
+         if len(documents) == 1 and len(content_ids) == 1:
+             return content_ids[0]
+         return content_ids

      def delete_documents(self, document_ids: List[str]) -> None:
          """
@@ -444,7 +448,6 @@ class IndexifyClient:
              json=req,
              headers={"Content-Type": "application/json"},
          )
-         response.raise_for_status()

      def update_content(self, document_id: str, path: str) -> None:
          """
@@ -457,7 +460,6 @@ class IndexifyClient:
          response = self.put(
              f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
          )
-         response.raise_for_status()

      def get_structured_data(self, content_id: str) -> dict:
          """
@@ -469,7 +471,6 @@ class IndexifyClient:
          response = self.get(
              f"namespaces/{self.namespace}/content/{content_id}/metadata"
          )
-         response.raise_for_status()
          return response.json().get("metadata", [])

      def search_index(
@@ -490,7 +491,6 @@ class IndexifyClient:
              json=req,
              headers={"Content-Type": "application/json"},
          )
-         response.raise_for_status()
          return response.json()["results"]

      def upload_file(self, extraction_graphs: Union[str, List[str]], path: str, id=None, labels: dict = {}) -> str:
@@ -513,7 +513,6 @@ class IndexifyClient:
              data=labels,
              params=params,
          )
-         response.raise_for_status()
          response_json = response.json()
          return response_json["content_id"]

@@ -522,7 +521,6 @@ class IndexifyClient:
          List all schemas in the current namespace.
          """
          response = self.get(f"namespaces/{self.namespace}/schemas")
-         response.raise_for_status()
          return response.json()

      def get_content_tree(self, content_id: str):
  def get_content_tree(self, content_id: str):
@@ -535,9 +533,35 @@ class IndexifyClient:
535
533
  response = self.get(
536
534
  f"namespaces/{self.namespace}/content/{content_id}/content-tree"
537
535
  )
538
- response.raise_for_status()
539
536
  return response.json()
540
537
 
538
+ def get_extracted_content(self, content_id: str, level: int = 0):
539
+ """
540
+ Get list of child for a given content id and their content up to the specified level.
541
+
542
+ Args:
543
+ - content_id (str): id of content
544
+ - level (int): depth of content retrieval (default: 0)
545
+ """
546
+ content_tree = self.get_content_tree(content_id)
547
+ child_list = []
548
+
549
+ def traverse_content(parent_id, current_level):
550
+ if current_level > level:
551
+ return
552
+
553
+ for item in content_tree['content_tree_metadata']:
554
+ if item['parent_id'] == parent_id:
555
+ child_id = item['id']
556
+ content = self.download_content(child_id)
557
+ child_list.append({'id': child_id, 'content': content})
558
+
559
+ traverse_content(child_id, current_level + 1)
560
+
561
+ traverse_content(content_id, 0)
562
+
563
+ return child_list
564
+
541
565
  def sql_query(self, query: str):
542
566
  """
543
567
  Execute a SQL query.
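Note the level semantics of the new method: traversal starts at level 0, so the default returns only direct children; level=1 adds grandchildren, and so on. Content is fetched eagerly via download_content, so each entry carries raw bytes. A sketch with an illustrative content id:

    children = client.get_extracted_content("771372172facf8a5")       # direct children only
    tree = client.get_extracted_content("771372172facf8a5", level=1)  # children + grandchildren
    for child in tree:
        print(child["id"], len(child["content"]))                     # content is bytes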
@@ -551,7 +575,6 @@ class IndexifyClient:
              json=req,
              headers={"Content-Type": "application/json"},
          )
-         response.raise_for_status()
          result = response.json()
          rows = []
          for row in result["rows"]:
@@ -570,8 +593,19 @@ class IndexifyClient:
              json=req,
              headers={"Content-Type": "application/json"},
          )
-         response.raise_for_status()
          return response.json()
+
+     def wait_for_extraction(self, content_id: str):
+         """
+         Wait for extraction to complete for a given content id
+
+         Args:
+             - content_id (str): id of content
+         """
+         response = self.get(
+             f"namespaces/{self.namespace}/content/{content_id}/wait"
+         )
+         response.raise_for_status()

      def generate_unique_hex_id(self):
          """
@@ -580,6 +614,7 @@ class IndexifyClient:
          Returns:
              str: a unique hexadecimal string
          """
+         logging.warning("This method is deprecated. Use generate_unique_hex_id from indexify instead.")
          return uuid.uuid4().hex[:16]

      def generate_hash_from_string(self, input_string: str):
@@ -592,5 +627,19 @@ class IndexifyClient:
          Returns:
              str: The hexadecimal hash of the input string.
          """
+         logging.warning("This method is deprecated. Use generate_hash_from_string from indexify instead.")
          hash_object = hashlib.sha256(input_string.encode())
          return hash_object.hexdigest()[:16]
+
+     def __print_additional_error_context(self, error: Error):
+         print(error)
+
+         if error.status == "ExtractionGraphError":
+             graphs = [eg.name for eg in self.extraction_graphs]
+             extractors = [ext.name for ext in self.extractors()]
+             print(f"Available extraction graphs: {graphs}")
+             print(f"Available extractors: {extractors}")
+
+         if error.status == "SearchError":
+             indexes = [index["name"] for index in self.indexes()]
+             print(f"Available indexes: {indexes}")
indexify/error.py ADDED
@@ -0,0 +1,30 @@
+ class Error(Exception):
+     status: str
+     message: str
+
+     def __init__(self, status: str, message: str):
+         self.status = status
+         self.message = message
+
+     @staticmethod
+     def from_tonic_error_string(url: str, error: str) -> "Error":
+         data = error.split(", ")
+
+         message = data[1].split(": ", 1)[1]
+         if message.startswith('"') and message.endswith('"'):
+             message = message[1:-1]
+
+         status = "GeneralError"
+         if "extraction_graph" in url:
+             status = "ExtractionGraphError"
+         elif "search" in url:
+             status = "SearchError"
+
+         error = Error(status, message)
+         return error
+
+     def __str__(self):
+         return f"{self.status} | {self.message.capitalize()}"
+
+     def __repr__(self):
+         return f"Error(status={self.status!r}, message={self.message!r})"
{indexify-0.0.21.dist-info → indexify-0.0.23.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: indexify
- Version: 0.0.21
+ Version: 0.0.23
  Summary: Python Client for Indexify
  Home-page: https://github.com/tensorlakeai/indexify
  License: Apache 2.0
{indexify-0.0.21.dist-info → indexify-0.0.23.dist-info}/RECORD CHANGED
@@ -1,13 +1,14 @@
- indexify/__init__.py,sha256=hhDqRvJo4gCW1eqVgFblxKiBzArCFfo2eFGOBsQkDOc,401
- indexify/client.py,sha256=s2Xflh75574WvNp0lbG6PGtK2Dy3CMfME5MDK1iDgR4,19334
+ indexify/__init__.py,sha256=Y40-Ur_tL7kGGs-reh9BTfEYGe-KyGxgdg-CmoFsXRQ,473
+ indexify/client.py,sha256=Q6QJ_yzJMmH_h0x3EwXL69qmp-TPrU7lcQedw__rRnk,21238
  indexify/data_containers.py,sha256=r1wxJPtsmXbyKvb17fqxm-dPjKz51oZ62f8A8Zxls1c,361
+ indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
  indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
  indexify/extraction_policy.py,sha256=dIyQK3N-QOpQ0BPjiZ_635o8A5ITNxaz1syQ_FPaE0k,1851
  indexify/extractor.py,sha256=sWFLlXHgEfWlmiKAXN6ytUt_uG7th-XGNHqz-TG39gs,1216
  indexify/index.py,sha256=RvxYhJXEth-GKvqzlMiz5PuN1eIbZk84pt20piA1Gsw,504
  indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
  indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
- indexify-0.0.21.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- indexify-0.0.21.dist-info/METADATA,sha256=Rb_7fwsIiJKuJaLnmJp7Cw4exYLhHcdx48OfBcFzaO4,1753
- indexify-0.0.21.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- indexify-0.0.21.dist-info/RECORD,,
+ indexify-0.0.23.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ indexify-0.0.23.dist-info/METADATA,sha256=vQqfHcLrf52YvCNbuAc1m9yLh-rVSGkRqfMKbcTuSb0,1753
+ indexify-0.0.23.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ indexify-0.0.23.dist-info/RECORD,,