indexify 0.0.28__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
indexify/__init__.py CHANGED
@@ -2,10 +2,12 @@ from .index import Index
  from .client import IndexifyClient
  from .extraction_policy import ExtractionGraph
  from .client import IndexifyClient, Document, generate_hash_from_string, generate_unique_hex_id
+ from .data_containers import Content
  from .settings import DEFAULT_SERVICE_URL

  __all__ = [
      "Index",
+     "Content",
      "Document",
      "IndexifyClient",
      "ExtractionGraph",
indexify/client.py CHANGED
@@ -10,7 +10,7 @@ from .extraction_policy import ExtractionPolicy, ExtractionGraph
  from .index import Index
  from .utils import json_set_default
  from .error import Error
- from .data_containers import TextChunk
+ from .data_containers import TextChunk, Content
  from indexify.exceptions import ApiException
  from dataclasses import dataclass
  from typing import List, Optional, Union, Dict
@@ -153,7 +153,11 @@ class IndexifyClient:
          try:
              response = self._client.request(method, timeout=self._timeout, **kwargs)
              status_code = str(response.status_code)
-             if status_code.startswith("4") or status_code.startswith("5"):
+             if status_code.startswith("4"):
+                 raise ApiException(
+                     "status code: " + status_code + " request args: " + str(kwargs)
+                 )
+             if status_code.startswith("5"):
                  raise ApiException(response.text)
              # error = Error.from_tonic_error_string(str(response.url), response.text)
              # self.__print_additional_error_context(error)
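With this change, 4xx responses raise ApiException carrying the status code and the request arguments, while 5xx responses still raise ApiException with the raw response body. A minimal sketch of catching the new error; the content id is illustrative and a reachable Indexify server at the default service URL is assumed:

    from indexify import IndexifyClient
    from indexify.exceptions import ApiException

    client = IndexifyClient()  # assumes a server at the default service URL
    try:
        # a request for a missing content id should now surface as a 4xx error
        client.download_content("does-not-exist")
    except ApiException as e:
        # e.g. 'status code: 404 request args: {...}' for client errors
        print("request failed:", e)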
@@ -340,11 +344,11 @@ class IndexifyClient:
          """
          Retrieve and update the list of extraction policies for the current namespace.
          """
-         response = self.get(f"namespaces/{self.namespace}")
+         response = self.get(f"namespaces/{self.namespace}/extraction_graphs")
          json = response.json()

          self.extraction_graphs = []
-         for graph in json["namespace"]["extraction_graphs"]:
+         for graph in json["extraction_graphs"]:
              self.extraction_graphs.append(ExtractionGraph.from_dict(graph))

          return self.extraction_graphs
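The extraction-graph list now comes from the dedicated .../extraction_graphs route and the response is parsed straight from its "extraction_graphs" key. A sketch of the equivalent raw call, done by hand with the client's own get helper that this diff shows on the class (the wrapping method's name is not visible in this hunk):

    from indexify import IndexifyClient, ExtractionGraph

    client = IndexifyClient()  # assumes a server at the default service URL
    # same route and parsing as the method above
    response = client.get(f"namespaces/{client.namespace}/extraction_graphs")
    graphs = [ExtractionGraph.from_dict(g) for g in response.json()["extraction_graphs"]]
    print(len(graphs), "extraction graphs in namespace", client.namespace)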
@@ -366,6 +370,28 @@
          )
          return

+     def link_extraction_graphs(
+         self, source_graph: str, content_source: str, linked_graph: str
+     ):
+         """
+         Link an extraction graph to another extraction graph.
+
+         Args:
+             - source_graph (str): source extraction graph
+             - content_source (str): content source in source graph
+             - linked_graph (str): target extraction graph
+         """
+         req = {
+             "content_source": content_source,
+             "linked_graph_name": linked_graph,
+         }
+         response = self.post(
+             f"namespaces/{self.namespace}/extraction_graphs/{source_graph}/links",
+             json=req,
+             headers={"Content-Type": "application/json"},
+         )
+         return
+
      def get_content_metadata(self, content_id: str) -> dict:
          """
          Get metadata for a specific content ID in a given index.
@@ -373,17 +399,17 @@
          Args:
              - content_id (str): content id to query
          """
-         response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
+         response = self.get(f"namespaces/{self.namespace}/content/{content_id}/metadata")
          return response.json()

-     def download_content(self, id: str) -> bytes:
+     def download_content(self, content_id: str) -> bytes:
          """
          Download content from id. Return bytes

          Args:
-             - id (str): id of content to download
+             - content_id (str): id of content to download
          """
-         response = self.get(f"namespaces/{self.namespace}/content/{id}/download")
+         response = self.get(f"namespaces/{self.namespace}/content/{content_id}/download")
          return response.content

      def add_documents(
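A minimal usage sketch of the new link_extraction_graphs method, the /metadata route, and the renamed download_content parameter; the graph, policy, and content ids are illustrative and a reachable server at the default service URL is assumed:

    from indexify import IndexifyClient

    client = IndexifyClient()  # assumes a server at the default service URL

    # route the output of one graph's content source into another graph
    client.link_extraction_graphs(
        source_graph="pdf_graph",        # illustrative source graph name
        content_source="pdf_extractor",  # content source inside pdf_graph
        linked_graph="summary_graph",    # illustrative target graph name
    )

    # metadata now comes from the .../metadata route; download takes content_id
    meta = client.get_content_metadata("some-content-id")
    data = client.download_content("some-content-id")
    print(meta, len(data))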
@@ -422,21 +448,21 @@
              raise TypeError(
                  "Invalid type for documents. Expected Document, str, or list of these."
              )
-
-         req = {
-             "documents": [doc._asdict() for doc in documents],
-             "extraction_graph_names": extraction_graphs,
-         }
-         response = self.post(
-             f"namespaces/{self.namespace}/add_texts",
-             json=req,
-             headers={"Content-Type": "application/json"},
-         )
-         response.raise_for_status()
-         response_json = response.json()
-         content_ids = response_json["content_ids"]
-         if len(documents) == 1 and len(content_ids) == 1:
-             return content_ids[0]
+         for document in documents:
+             document.labels["mime_type"] = "text/plain"
+         content_ids = []
+         if isinstance(extraction_graphs, str):
+             extraction_graphs = [extraction_graphs]
+         for extraction_graph in extraction_graphs:
+             for document in documents:
+                 response = self.post(
+                     f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                     files={"file": document.text},
+                     data={"labels": json.dumps(document.labels)},
+                 )
+                 response_json = response.json()
+                 content_id = response_json["content_id"]
+                 content_ids.append(content_id)
          return content_ids

      def delete_documents(self, document_ids: List[str]) -> None:
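add_documents now tags each document as text/plain and posts it to the per-graph extract endpoint, always returning a list of content ids (one per graph/document pair). A sketch under a few assumptions: keyword arguments follow the identifiers used in the new body since their order is not visible in this hunk, and the Document field names are inferred from the attributes the code reads (document.text, document.labels):

    from indexify import IndexifyClient, Document

    client = IndexifyClient()  # assumes a server at the default service URL
    docs = [
        # field names assumed from document.text / document.labels in the hunk above
        Document(text="hello indexify", labels={"source": "demo"}),
        Document(text="a second snippet", labels={"source": "demo"}),
    ]
    content_ids = client.add_documents(documents=docs, extraction_graphs="my_graph")
    print(content_ids)  # one id per document for the single graph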
@@ -504,14 +530,47 @@
              - top_k (int): top k nearest neighbors to be returned
              - filters (List[str]): list of filters to apply
          """
-         req = {"index": name, "query": query, "k": top_k, "filters": filters}
+         req = {"query": query, "k": top_k, "filters": filters}
          response = self.post(
-             f"namespaces/{self.namespace}/search",
+             f"namespaces/{self.namespace}/indexes/{name}/search",
              json=req,
              headers={"Content-Type": "application/json"},
          )
          return response.json()["results"]

+     def list_content(
+         self,
+         extraction_graph: str,
+         extraction_policy: str = "",
+         labels_filter: List[str] = [],
+         start_id: str = "",
+         limit: int = 10,
+     ) -> List[Content]:
+         """
+         List content in the current namespace.
+
+         Args:
+             - extraction_graph (str): extraction graph name
+             - start_index (str): start index for pagination
+             - limit (int): number of items to return
+         """
+         params = {"graph": extraction_graph, "start_id": start_id, "limit": limit}
+         if extraction_policy:
+             params["source"] = extraction_policy
+         else:
+             params["source"] = "ingestion"
+         if len(labels_filter) > 0:
+             params["labels_filter"] = labels_filter
+         response = self.get(
+             f"namespaces/{self.namespace}/content",
+             params=params,
+         )
+         content_list = response.json()["content_list"]
+         content = []
+         for item in content_list:
+             content.append(Content.from_dict(item))
+         return content
+
      def upload_file(
          self,
          extraction_graphs: Union[str, List[str]],
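The search body no longer carries the index name (it moved into the URL), and a list_content helper is added for paging through a graph's content. A sketch of list_content built directly from the signature above; the graph name is illustrative and the labels_filter string format is an assumption, since it is not specified in this diff:

    from indexify import IndexifyClient

    client = IndexifyClient()  # assumes a server at the default service URL
    # an empty extraction_policy means "ingestion", i.e. only originally ingested content
    page = client.list_content(
        extraction_graph="my_graph",
        extraction_policy="",
        labels_filter=["source=demo"],  # filter syntax assumed, not shown in this diff
        start_id="",
        limit=10,
    )
    for item in page:
        print(item.id, item.mime_type, item.extraction_policy)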
@@ -528,18 +587,20 @@
          """
          if isinstance(extraction_graphs, str):
              extraction_graphs = [extraction_graphs]
-         params = {"extraction_graph_names": extraction_graphs}
+         params = {}
          if id is not None:
              params["id"] = id
          with open(path, "rb") as f:
-             response = self.post(
-                 f"namespaces/{self.namespace}/upload_file",
-                 files={"file": f},
-                 data={"labels": json.dumps(labels)},
-                 params=params,
-             )
+             for extraction_graph in extraction_graphs:
+                 response = self.post(
+                     f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                     files={"file": f},
+                     data={"labels": json.dumps(labels)},
+                     params=params,
+                 )
              response_json = response.json()
-             return response_json["content_id"]
+             content_id = response_json["content_id"]
+         return content_id

      def list_schemas(self) -> List[str]:
          """
@@ -548,35 +609,32 @@
          response = self.get(f"namespaces/{self.namespace}/schemas")
          return response.json()

-     def get_content_tree(self, content_id: str):
+     def get_extracted_content(
+         self, ingested_content_id: str, graph_name: str, extractor_name: str, blocking=False
+     ):
          """
-         Get content tree for a given content id
+         Get list of child for a given content id and their content up to the specified level.

          Args:
-             - content_id (str): id of content
+             - ingested_content_id (str): id of content
+             - graph_name (str): name of extraction graph
+             - extractor_name (str): name of extractor
+             - blocking (bool): wait for extraction to complete before returning (default: False)
          """
+         if blocking:
+             self.wait_for_extraction(ingested_content_id)
          response = self.get(
-             f"namespaces/{self.namespace}/content/{content_id}/content-tree"
+             f"namespaces/{self.namespace}/extraction_graphs/{graph_name}/extraction_policies/{extractor_name}/content/{ingested_content_id}"
          )
-         return response.json()
-
-     def get_extracted_content(self, content_id: str, graph_name: str, policy_name: str):
-         """
-         Get list of child for a given content id and their content up to the specified level.
-
-         Args:
-             - content_id (str): id of content
-             - level (int): depth of content retrieval (default: 0)
-         """
-         content_tree = self.get_content_tree(content_id)
+         content_tree = response.json()
          child_list = []
          for item in content_tree["content_tree_metadata"]:
              if (
                  graph_name in item["extraction_graph_names"]
-                 and item["source"] == policy_name
+                 and item["source"] == extractor_name
              ):
                  content = self.download_content(item["id"])
-                 child_list.append({"id": item["id"], "content": content})
+                 child_list.append({"id": item["id"], "mime_type": item["mime_type"], "content": content})

          return child_list

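upload_file now posts the file to each graph's extract endpoint and returns the content id parsed from the response, and get_extracted_content (which replaces get_content_tree) reads a policy's children from the per-policy route, optionally blocking until extraction finishes. An end-to-end sketch under those assumptions; file, graph, and policy names are illustrative, and keyword arguments follow the identifiers visible in these hunks since the full signatures are not shown:

    from indexify import IndexifyClient

    client = IndexifyClient()  # assumes a server at the default service URL

    # upload a local file into one extraction graph
    content_id = client.upload_file(
        extraction_graphs="my_graph",   # a single graph name or a list of names
        path="report.pdf",              # illustrative local file
        labels={"source": "demo"},
    )

    # fetch the children produced by one policy of that graph, waiting for
    # extraction to finish first (blocking=True calls wait_for_extraction)
    children = client.get_extracted_content(
        ingested_content_id=content_id,
        graph_name="my_graph",
        extractor_name="my_policy",     # illustrative extraction policy name
        blocking=True,
    )
    for child in children:
        print(child["id"], child["mime_type"], len(child["content"]))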
@@ -634,9 +692,13 @@
          """
          if type(content_ids) == str:
              content_ids = [content_ids]
-         print("Waiting for extraction to complete for content id: ", ",".join(content_ids))
+         print(
+             "Waiting for extraction to complete for content id: ", ",".join(content_ids)
+         )
          for content_id in content_ids:
-             response = self.get(f"namespaces/{self.namespace}/content/{content_id}/wait")
+             response = self.get(
+                 f"namespaces/{self.namespace}/content/{content_id}/wait"
+             )
              print("Extraction completed for content id: ", content_id)
              response.raise_for_status()

indexify/data_containers.py CHANGED
@@ -2,6 +2,25 @@ from enum import Enum
  from typing import List
  from dataclasses import dataclass, field

+ @dataclass
+ class Content:
+     id: str
+     parent_id: str
+     labels: dict[str, any]
+     extraction_graph_names: List[str]
+     extraction_policy: str
+     mime_type: str
+
+     @classmethod
+     def from_dict(cls, json: dict):
+         return Content(
+             id=json["id"],
+             parent_id=json["parent_id"],
+             labels=json["labels"],
+             extraction_graph_names=json["extraction_graph_names"],
+             extraction_policy=json["source"],
+             mime_type=json["mime_type"],
+         )

  @dataclass
  class TextChunk:
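The new Content container mirrors the JSON shape the content endpoints return; note that the server-side "source" field is stored as extraction_policy. A small sketch of from_dict on a hand-written payload with illustrative values:

    from indexify import Content

    raw = {
        "id": "content-123",
        "parent_id": "",
        "labels": {"source": "demo"},
        "extraction_graph_names": ["my_graph"],
        "source": "ingestion",            # mapped onto Content.extraction_policy
        "mime_type": "text/plain",
    }
    item = Content.from_dict(raw)
    print(item.id, item.extraction_policy, item.mime_type)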
{indexify-0.0.28.dist-info → indexify-0.0.31.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: indexify
- Version: 0.0.28
+ Version: 0.0.31
  Summary: Python Client for Indexify
  Home-page: https://github.com/tensorlakeai/indexify
  License: Apache 2.0
@@ -35,8 +35,8 @@ pip install indexify

  ## Usage

- See the [getting started](https://getindexify.com/getting_started/) guide for examples of how to use the client.
- Look at the [examples](examples) directory for more examples.
+ See the [getting started](https://docs.getindexify.com/getting_started/) guide for examples of how to use the client.
+ Look at the [examples](https://github.com/tensorlakeai/indexify/tree/main/examples) directory for more examples.

  ## Development

{indexify-0.0.28.dist-info → indexify-0.0.31.dist-info}/RECORD RENAMED
@@ -1,6 +1,6 @@
- indexify/__init__.py,sha256=Y40-Ur_tL7kGGs-reh9BTfEYGe-KyGxgdg-CmoFsXRQ,473
- indexify/client.py,sha256=qECQvvVPvzMDra8OQ-94u7J5ABFWSEmEx_uIEdJ3RiU,22159
- indexify/data_containers.py,sha256=r1wxJPtsmXbyKvb17fqxm-dPjKz51oZ62f8A8Zxls1c,361
+ indexify/__init__.py,sha256=xqymbwqaiHiWXFpm7Cll2j-_V1lNQH2EEGlevtCTZK4,525
+ indexify/client.py,sha256=YkNhM1xDe0VcPx9Z3yLdl3y_msoOrGAj3ykefcItVhE,24653
+ indexify/data_containers.py,sha256=fIX_rghpojrCUtmZ0grywoq_HWniDgN1mnR7yXDej-Y,874
  indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
  indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
  indexify/extraction_policy.py,sha256=POluredrBw6DzTN0OyfPLaLFP5-2DoWGRK0V6w68R28,2080
@@ -8,7 +8,7 @@ indexify/extractor.py,sha256=sWFLlXHgEfWlmiKAXN6ytUt_uG7th-XGNHqz-TG39gs,1216
  indexify/index.py,sha256=RvxYhJXEth-GKvqzlMiz5PuN1eIbZk84pt20piA1Gsw,504
  indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
  indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
- indexify-0.0.28.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- indexify-0.0.28.dist-info/METADATA,sha256=3r9wATECrfXFfmI68yU2cUOtrOBR4nT_ajPnag8IdZg,1798
- indexify-0.0.28.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- indexify-0.0.28.dist-info/RECORD,,
+ indexify-0.0.31.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ indexify-0.0.31.dist-info/METADATA,sha256=TYHwRuFojns0bh0g8IsHDGyVUCo01dp0ysbRfnE-y20,1854
+ indexify-0.0.31.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ indexify-0.0.31.dist-info/RECORD,,