indexify 0.0.21__tar.gz → 0.0.22__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: indexify
3
- Version: 0.0.21
3
+ Version: 0.0.22
4
4
  Summary: Python Client for Indexify
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -9,10 +9,10 @@ from .extractor import Extractor
9
9
  from .extraction_policy import ExtractionPolicy, ExtractionGraph
10
10
  from .index import Index
11
11
  from .utils import json_set_default
12
+ from .error import Error
12
13
  from .data_containers import TextChunk
13
14
  from indexify.exceptions import ApiException
14
15
  from dataclasses import dataclass
15
-
16
16
  from typing import List, Optional, Union, Dict
17
17
 
18
18
  Document = namedtuple("Document", ["text", "labels", "id"])
@@ -75,12 +75,7 @@ class IndexifyClient:
75
75
  self._timeout = kwargs.get("timeout")
76
76
 
77
77
  # get namespace data
78
- response = self.get(f"namespaces/{self.namespace}")
79
- response.raise_for_status()
80
- resp_json = response.json()
81
- # initialize extraction_policies
82
- for eb in resp_json["namespace"]["extraction_graphs"]:
83
- self.extraction_graphs.append(ExtractionGraph.from_dict(eb))
78
+ self.extraction_graphs = self.get_extraction_graphs()
84
79
 
85
80
  @classmethod
86
81
  def with_mtls(
@@ -130,12 +125,18 @@ class IndexifyClient:
130
125
  return client
131
126
 
132
127
  def _request(self, method: str, **kwargs) -> httpx.Response:
133
- response = self._client.request(method, timeout=self._timeout, **kwargs)
134
128
  try:
135
- response.raise_for_status()
136
- except httpx.HTTPStatusError as exc:
137
- print(f"exception: {exc}, response text: {response.text}")
138
- raise exc
129
+ response = self._client.request(method, timeout=self._timeout, **kwargs)
130
+ status_code = str(response.status_code)
131
+ if status_code.startswith("4") or status_code.startswith("5"):
132
+ error = Error.from_tonic_error_string(str(response.url), response.text)
133
+ self.__print_additional_error_context(error)
134
+ raise error
135
+ except httpx.ConnectError:
136
+ message = f"Make sure the server is running and accesible at {self._service_url}"
137
+ error = Error(status="ConnectionError", message=message)
138
+ print(error)
139
+ raise error
139
140
  return response
140
141
 
141
142
  def get(self, endpoint: str, **kwargs) -> httpx.Response:
@@ -291,7 +292,6 @@ class IndexifyClient:
291
292
  List[Index]: list of indexes in the current namespace
292
293
  """
293
294
  response = self.get(f"namespaces/{self.namespace}/indexes")
294
- response.raise_for_status()
295
295
  return response.json()["indexes"]
296
296
 
297
297
  def extractors(self) -> List[Extractor]:
@@ -308,17 +308,18 @@ class IndexifyClient:
308
308
  extractors.append(Extractor.from_dict(ed))
309
309
  return extractors
310
310
 
311
- def get_extraction_policies(self):
311
+ def get_extraction_graphs(self) -> List[ExtractionGraph]:
312
312
  """
313
313
  Retrieve and update the list of extraction policies for the current namespace.
314
314
  """
315
315
  response = self.get(f"namespaces/{self.namespace}")
316
- response.raise_for_status()
316
+ json = response.json()
317
317
 
318
- self.extraction_policies = []
319
- for eb in response.json()["namespace"]["extraction_policies"]:
320
- self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
321
- return self.extraction_policies
318
+ self.extraction_graphs = []
319
+ for graph in json["namespace"]["extraction_graphs"]:
320
+ self.extraction_graphs.append(ExtractionGraph.from_dict(graph))
321
+
322
+ return self.extraction_graphs
322
323
 
323
324
  def create_extraction_graph(self, extraction_graph: ExtractionGraph):
324
325
  """
@@ -335,7 +336,6 @@ class IndexifyClient:
335
336
  data=request_body,
336
337
  headers={"Content-Type": "application/json"},
337
338
  )
338
- response.raise_for_status()
339
339
  return
340
340
 
341
341
  def get_content_metadata(self, content_id: str) -> dict:
@@ -346,29 +346,8 @@ class IndexifyClient:
346
346
  - content_id (str): content id to query
347
347
  """
348
348
  response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
349
- response.raise_for_status()
350
349
  return response.json()
351
-
352
- def get_extracted_content(
353
- self,
354
- content_id: str = None,
355
- ):
356
- """
357
- Get list of content from current namespace.
358
-
359
- Args:
360
- - parent_id (str): Optional filter for parent id
361
- - labels_eq (str): Optional filter for labels
362
- """
363
- params = {"parent_id": content_id}
364
-
365
- response = self.get(f"namespaces/{self.namespace}/content", params=params)
366
- response.raise_for_status()
367
- return [
368
- self._add_content_url(content)
369
- for content in response.json()["content_list"]
370
- ]
371
-
350
+
372
351
  def download_content(self, id: str) -> bytes:
373
352
  """
374
353
  Download content from id. Return bytes
@@ -377,18 +356,14 @@ class IndexifyClient:
377
356
  - id (str): id of content to download
378
357
  """
379
358
  response = self.get(f"namespaces/{self.namespace}/content/{id}/download")
380
- try:
381
- response.raise_for_status()
382
- return response.content
383
- except httpx.HTTPStatusError as exc:
384
- raise ApiException(exc.response.text)
359
+ return response.content
385
360
 
386
361
  def add_documents(
387
362
  self,
388
363
  extraction_graphs: Union[str, List[str]],
389
364
  documents: Union[Document, str, List[Union[Document, str]]],
390
365
  doc_id=None,
391
- ) -> None:
366
+ ) -> Union[str, List[str]]:
392
367
  """
393
368
  Add documents to current namespace.
394
369
 
@@ -430,6 +405,11 @@ class IndexifyClient:
430
405
  headers={"Content-Type": "application/json"},
431
406
  )
432
407
  response.raise_for_status()
408
+ response_json = response.json()
409
+ content_ids = response_json["content_ids"]
410
+ if len(documents) == 1 and len(content_ids) == 1:
411
+ return content_ids[0]
412
+ return content_ids
433
413
 
434
414
  def delete_documents(self, document_ids: List[str]) -> None:
435
415
  """
@@ -444,7 +424,6 @@ class IndexifyClient:
444
424
  json=req,
445
425
  headers={"Content-Type": "application/json"},
446
426
  )
447
- response.raise_for_status()
448
427
 
449
428
  def update_content(self, document_id: str, path: str) -> None:
450
429
  """
@@ -457,7 +436,6 @@ class IndexifyClient:
457
436
  response = self.put(
458
437
  f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
459
438
  )
460
- response.raise_for_status()
461
439
 
462
440
  def get_structured_data(self, content_id: str) -> dict:
463
441
  """
@@ -469,7 +447,6 @@ class IndexifyClient:
469
447
  response = self.get(
470
448
  f"namespaces/{self.namespace}/content/{content_id}/metadata"
471
449
  )
472
- response.raise_for_status()
473
450
  return response.json().get("metadata", [])
474
451
 
475
452
  def search_index(
@@ -490,7 +467,6 @@ class IndexifyClient:
490
467
  json=req,
491
468
  headers={"Content-Type": "application/json"},
492
469
  )
493
- response.raise_for_status()
494
470
  return response.json()["results"]
495
471
 
496
472
  def upload_file(self, extraction_graphs: Union[str, List[str]], path: str, id=None, labels: dict = {}) -> str:
@@ -513,7 +489,6 @@ class IndexifyClient:
513
489
  data=labels,
514
490
  params=params,
515
491
  )
516
- response.raise_for_status()
517
492
  response_json = response.json()
518
493
  return response_json["content_id"]
519
494
 
@@ -522,7 +497,6 @@ class IndexifyClient:
522
497
  List all schemas in the current namespace.
523
498
  """
524
499
  response = self.get(f"namespaces/{self.namespace}/schemas")
525
- response.raise_for_status()
526
500
  return response.json()
527
501
 
528
502
  def get_content_tree(self, content_id: str):
@@ -535,9 +509,35 @@ class IndexifyClient:
535
509
  response = self.get(
536
510
  f"namespaces/{self.namespace}/content/{content_id}/content-tree"
537
511
  )
538
- response.raise_for_status()
539
512
  return response.json()
540
513
 
514
+ def get_extracted_content(self, content_id: str, level: int = 0):
515
+ """
516
+ Get list of child for a given content id and their content up to the specified level.
517
+
518
+ Args:
519
+ - content_id (str): id of content
520
+ - level (int): depth of content retrieval (default: 0)
521
+ """
522
+ content_tree = self.get_content_tree(content_id)
523
+ child_list = []
524
+
525
+ def traverse_content(parent_id, current_level):
526
+ if current_level > level:
527
+ return
528
+
529
+ for item in content_tree['content_tree_metadata']:
530
+ if item['parent_id'] == parent_id:
531
+ child_id = item['id']
532
+ content = self.download_content(child_id)
533
+ child_list.append({'id': child_id, 'content': content})
534
+
535
+ traverse_content(child_id, current_level + 1)
536
+
537
+ traverse_content(content_id, 0)
538
+
539
+ return child_list
540
+
541
541
  def sql_query(self, query: str):
542
542
  """
543
543
  Execute a SQL query.
@@ -551,7 +551,6 @@ class IndexifyClient:
551
551
  json=req,
552
552
  headers={"Content-Type": "application/json"},
553
553
  )
554
- response.raise_for_status()
555
554
  result = response.json()
556
555
  rows = []
557
556
  for row in result["rows"]:
@@ -570,8 +569,19 @@ class IndexifyClient:
570
569
  json=req,
571
570
  headers={"Content-Type": "application/json"},
572
571
  )
573
- response.raise_for_status()
574
572
  return response.json()
573
+
574
+ def wait_for_extraction(self, content_id: str):
575
+ """
576
+ Wait for extraction to complete for a given content id
577
+
578
+ Args:
579
+ - content_id (str): id of content
580
+ """
581
+ response = self.get(
582
+ f"namespaces/{self.namespace}/content/{content_id}/wait"
583
+ )
584
+ response.raise_for_status()
575
585
 
576
586
  def generate_unique_hex_id(self):
577
587
  """
@@ -594,3 +604,16 @@ class IndexifyClient:
594
604
  """
595
605
  hash_object = hashlib.sha256(input_string.encode())
596
606
  return hash_object.hexdigest()[:16]
607
+
608
+ def __print_additional_error_context(self, error: Error):
609
+ print(error)
610
+
611
+ if error.status == "ExtractionGraphError":
612
+ graphs = [eg.name for eg in self.extraction_graphs]
613
+ extractors = [ext.name for ext in self.extractors()]
614
+ print(f"Available extraction graphs: {graphs}")
615
+ print(f"Available extractors: {extractors}")
616
+
617
+ if error.status == "SearchError":
618
+ indexes = [index["name"] for index in self.indexes()]
619
+ print(f"Available indexes: {indexes}")
@@ -0,0 +1,30 @@
1
+ class Error(Exception):
2
+ status: str
3
+ message: str
4
+
5
+ def __init__(self, status: str, message: str):
6
+ self.status = status
7
+ self.message = message
8
+
9
+ @staticmethod
10
+ def from_tonic_error_string(url: str, error: str) -> "Error":
11
+ data = error.split(", ")
12
+
13
+ message = data[1].split(": ", 1)[1]
14
+ if message.startswith('"') and message.endswith('"'):
15
+ message = message[1:-1]
16
+
17
+ status = "GeneralError"
18
+ if "extraction_graph" in url:
19
+ status = "ExtractionGraphError"
20
+ elif "search" in url:
21
+ status = "SearchError"
22
+
23
+ error = Error(status, message)
24
+ return error
25
+
26
+ def __str__(self):
27
+ return f"{self.status} | {self.message.capitalize()}"
28
+
29
+ def __repr__(self):
30
+ return f"Error(status={self.status!r}, message={self.message!r})"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
- version = "0.0.21"
3
+ version = "0.0.22"
4
4
  description = "Python Client for Indexify"
5
5
  authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
6
6
  license = "Apache 2.0"
File without changes
File without changes
File without changes
File without changes