indexify 0.0.28.tar.gz → 0.0.31.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.0.28 → indexify-0.0.31}/PKG-INFO +3 -3
- {indexify-0.0.28 → indexify-0.0.31}/README.md +2 -2
- {indexify-0.0.28 → indexify-0.0.31}/indexify/__init__.py +2 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/client.py +114 -52
- indexify-0.0.31/indexify/data_containers.py +37 -0
- {indexify-0.0.28 → indexify-0.0.31}/pyproject.toml +1 -1
- indexify-0.0.28/indexify/data_containers.py +0 -18
- {indexify-0.0.28 → indexify-0.0.31}/LICENSE.txt +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/error.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/exceptions.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/extraction_policy.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/extractor.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/index.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/settings.py +0 -0
- {indexify-0.0.28 → indexify-0.0.31}/indexify/utils.py +0 -0
```diff
--- indexify-0.0.28/PKG-INFO
+++ indexify-0.0.31/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.28
+Version: 0.0.31
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -35,8 +35,8 @@ pip install indexify
 
 ## Usage
 
-See the [getting started](https://getindexify.com/getting_started/) guide for examples of how to use the client.
-Look at the [examples](examples) directory for more examples.
+See the [getting started](https://docs.getindexify.com/getting_started/) guide for examples of how to use the client.
+Look at the [examples](https://github.com/tensorlakeai/indexify/tree/main/examples) directory for more examples.
 
 ## Development
 
```
```diff
--- indexify-0.0.28/README.md
+++ indexify-0.0.31/README.md
@@ -15,8 +15,8 @@ pip install indexify
 
 ## Usage
 
-See the [getting started](https://getindexify.com/getting_started/) guide for examples of how to use the client.
-Look at the [examples](examples) directory for more examples.
+See the [getting started](https://docs.getindexify.com/getting_started/) guide for examples of how to use the client.
+Look at the [examples](https://github.com/tensorlakeai/indexify/tree/main/examples) directory for more examples.
 
 ## Development
 
```
```diff
--- indexify-0.0.28/indexify/__init__.py
+++ indexify-0.0.31/indexify/__init__.py
@@ -2,10 +2,12 @@ from .index import Index
 from .client import IndexifyClient
 from .extraction_policy import ExtractionGraph
 from .client import IndexifyClient, Document, generate_hash_from_string, generate_unique_hex_id
+from .data_containers import Content
 from .settings import DEFAULT_SERVICE_URL
 
 __all__ = [
     "Index",
+    "Content",
     "Document",
     "IndexifyClient",
     "ExtractionGraph",
```
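With `Content` now re-exported from the package root, records returned by the new `list_content` API (added to `client.py` further down in this diff) can be typed directly. A minimal usage sketch (the graph name is a placeholder, and a running Indexify server at the default service URL is assumed):

```python
from typing import List

from indexify import IndexifyClient, Content

client = IndexifyClient()  # connects to DEFAULT_SERVICE_URL

items: List[Content] = client.list_content(extraction_graph="my-graph")
for item in items:
    print(item.id, item.mime_type, item.extraction_policy)
```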
```diff
--- indexify-0.0.28/indexify/client.py
+++ indexify-0.0.31/indexify/client.py
@@ -10,7 +10,7 @@ from .extraction_policy import ExtractionPolicy, ExtractionGraph
 from .index import Index
 from .utils import json_set_default
 from .error import Error
-from .data_containers import TextChunk
+from .data_containers import TextChunk, Content
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
 from typing import List, Optional, Union, Dict
```
```diff
@@ -153,7 +153,11 @@ class IndexifyClient:
         try:
             response = self._client.request(method, timeout=self._timeout, **kwargs)
             status_code = str(response.status_code)
-            if status_code.startswith("4")
+            if status_code.startswith("4"):
+                raise ApiException(
+                    "status code: " + status_code + " request args: " + str(kwargs)
+                )
+            if status_code.startswith("5"):
                 raise ApiException(response.text)
             # error = Error.from_tonic_error_string(str(response.url), response.text)
             # self.__print_additional_error_context(error)
```
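With this change, 4xx responses raise an `ApiException` whose message carries the status code and the request arguments, while 5xx responses keep raising with the raw response body. A sketch of what calling code sees (the content id is a placeholder):

```python
from indexify import IndexifyClient
from indexify.exceptions import ApiException

client = IndexifyClient()
try:
    client.get_content_metadata("no-such-id")  # placeholder id
except ApiException as e:
    # For a 4xx the message now looks like:
    #   status code: 404 request args: {...}
    print(e)
```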
```diff
@@ -340,11 +344,11 @@ class IndexifyClient:
         """
         Retrieve and update the list of extraction policies for the current namespace.
         """
-        response = self.get(f"namespaces/{self.namespace}")
+        response = self.get(f"namespaces/{self.namespace}/extraction_graphs")
         json = response.json()
 
         self.extraction_graphs = []
-        for graph in json["
+        for graph in json["extraction_graphs"]:
             self.extraction_graphs.append(ExtractionGraph.from_dict(graph))
 
         return self.extraction_graphs
```
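The graph list now comes from a dedicated `/extraction_graphs` route rather than the namespace object. A rough sketch of the new response shape using `httpx` (the URL reflects this client's default service address, and the `name` field on each graph object is an assumption):

```python
import httpx

# DEFAULT_SERVICE_URL is http://localhost:8900 in this client.
resp = httpx.get("http://localhost:8900/namespaces/default/extraction_graphs")
resp.raise_for_status()
for graph in resp.json()["extraction_graphs"]:
    print(graph.get("name"))
```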
```diff
@@ -366,6 +370,28 @@ class IndexifyClient:
         )
         return
 
+    def link_extraction_graphs(
+        self, source_graph: str, content_source: str, linked_graph: str
+    ):
+        """
+        Link an extraction graph to another extraction graph.
+
+        Args:
+            - source_graph (str): source extraction graph
+            - content_source (str): content source in source graph
+            - linked_graph (str): target extraction graph
+        """
+        req = {
+            "content_source": content_source,
+            "linked_graph_name": linked_graph,
+        }
+        response = self.post(
+            f"namespaces/{self.namespace}/extraction_graphs/{source_graph}/links",
+            json=req,
+            headers={"Content-Type": "application/json"},
+        )
+        return
+
     def get_content_metadata(self, content_id: str) -> dict:
         """
         Get metadata for a specific content ID in a given index.
```
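`link_extraction_graphs` is new in this release: it routes the output of one graph's policy into another graph via the `/links` route. A usage sketch (all graph and policy names are placeholders):

```python
from indexify import IndexifyClient

client = IndexifyClient()
# Feed whatever the "chunker" policy of ingest-pdfs produces into the
# embed-chunks graph as its input.
client.link_extraction_graphs(
    source_graph="ingest-pdfs",
    content_source="chunker",
    linked_graph="embed-chunks",
)
```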
```diff
@@ -373,17 +399,17 @@ class IndexifyClient:
         Args:
             - content_id (str): content id to query
         """
-        response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
+        response = self.get(f"namespaces/{self.namespace}/content/{content_id}/metadata")
         return response.json()
 
-    def download_content(self,
+    def download_content(self, content_id: str) -> bytes:
         """
         Download content from id. Return bytes
 
         Args:
-            -
+            - content_id (str): id of content to download
         """
-        response = self.get(f"namespaces/{self.namespace}/content/{
+        response = self.get(f"namespaces/{self.namespace}/content/{content_id}/download")
         return response.content
 
     def add_documents(
```
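Both lookups move to more specific routes (`/content/{id}/metadata` and `/content/{id}/download`), and `download_content` now takes an explicit `content_id`. A sketch (the id is a placeholder for one returned by an earlier ingest call):

```python
from indexify import IndexifyClient

client = IndexifyClient()
content_id = "content-123"  # placeholder

meta = client.get_content_metadata(content_id)  # GET .../content/{id}/metadata
data = client.download_content(content_id)      # GET .../content/{id}/download

with open("downloaded.bin", "wb") as f:
    f.write(data)
```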
```diff
@@ -422,21 +448,21 @@ class IndexifyClient:
             raise TypeError(
                 "Invalid type for documents. Expected Document, str, or list of these."
             )
-        [15 lines of the previous implementation not preserved in this view]
+        for document in documents:
+            document.labels["mime_type"] = "text/plain"
+        content_ids = []
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        for extraction_graph in extraction_graphs:
+            for document in documents:
+                response = self.post(
+                    f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                    files={"file": document.text},
+                    data={"labels": json.dumps(document.labels)},
+                )
+                response_json = response.json()
+                content_id = response_json["content_id"]
+                content_ids.append(content_id)
         return content_ids
 
     def delete_documents(self, document_ids: List[str]) -> None:
```
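`add_documents` now posts each document to every named graph's `/extract` route, stamping a `text/plain` mime label on the way, and collects one content id per upload. A sketch (graph names are placeholders; the keyword names follow the body above, and `Document(text=..., labels=...)` is assumed from how the loop reads those attributes):

```python
from indexify import IndexifyClient, Document

client = IndexifyClient()
ids = client.add_documents(
    extraction_graphs=["graph-a", "graph-b"],
    documents=[
        Document(text="first sample", labels={"source": "demo"}),
        Document(text="second sample", labels={"source": "demo"}),
    ],
)
# Two graphs x two documents -> four content ids.
print(ids)
```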
```diff
@@ -504,14 +530,47 @@ class IndexifyClient:
         - top_k (int): top k nearest neighbors to be returned
         - filters (List[str]): list of filters to apply
         """
-        req = {"
+        req = {"query": query, "k": top_k, "filters": filters}
         response = self.post(
-            f"namespaces/{self.namespace}/search",
+            f"namespaces/{self.namespace}/indexes/{name}/search",
             json=req,
             headers={"Content-Type": "application/json"},
         )
         return response.json()["results"]
 
+    def list_content(
+        self,
+        extraction_graph: str,
+        extraction_policy: str = "",
+        labels_filter: List[str] = [],
+        start_id: str = "",
+        limit: int = 10,
+    ) -> List[Content]:
+        """
+        List content in the current namespace.
+
+        Args:
+            - extraction_graph (str): extraction graph name
+            - start_index (str): start index for pagination
+            - limit (int): number of items to return
+        """
+        params = {"graph": extraction_graph, "start_id": start_id, "limit": limit}
+        if extraction_policy:
+            params["source"] = extraction_policy
+        else:
+            params["source"] = "ingestion"
+        if len(labels_filter) > 0:
+            params["labels_filter"] = labels_filter
+        response = self.get(
+            f"namespaces/{self.namespace}/content",
+            params=params,
+        )
+        content_list = response.json()["content_list"]
+        content = []
+        for item in content_list:
+            content.append(Content.from_dict(item))
+        return content
+
     def upload_file(
         self,
         extraction_graphs: Union[str, List[str]],
```
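Two retrieval changes land together here: vector search is now scoped to a named index (`/indexes/{name}/search`), and the new `list_content` pages through a graph's content, defaulting to raw ingested items (`source=ingestion`) unless an extraction policy is named. A paging sketch (graph and policy names are placeholders):

```python
from indexify import IndexifyClient

client = IndexifyClient()

# First page of raw ingested content for a graph.
page = client.list_content(extraction_graph="ingest-pdfs", limit=10)
for content in page:
    print(content.id, content.mime_type)

# Outputs of a single policy, resuming after the last id seen.
if page:
    more = client.list_content(
        extraction_graph="ingest-pdfs",
        extraction_policy="chunker",
        start_id=page[-1].id,
    )
```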
```diff
@@ -528,18 +587,20 @@ class IndexifyClient:
         """
         if isinstance(extraction_graphs, str):
             extraction_graphs = [extraction_graphs]
-        params = {
+        params = {}
         if id is not None:
             params["id"] = id
         with open(path, "rb") as f:
-            [6 lines of the previous implementation not preserved in this view]
+            for extraction_graph in extraction_graphs:
+                response = self.post(
+                    f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                    files={"file": f},
+                    data={"labels": json.dumps(labels)},
+                    params=params,
+                )
         response_json = response.json()
-        [1 line not preserved in this view]
+        content_id = response_json["content_id"]
+        return content_id
 
     def list_schemas(self) -> List[str]:
         """
```
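`upload_file` mirrors the `add_documents` change: the opened file is posted to each target graph's `/extract` route. Note that, as written, only the content id from the last upload is returned. A sketch (the path and graph names are placeholders; argument order past the first parameter follows the signature fragments visible above and is partly an assumption):

```python
from indexify import IndexifyClient

client = IndexifyClient()
content_id = client.upload_file(
    ["graph-a", "graph-b"],  # extraction_graphs: str or list
    "report.pdf",            # path of the file to ingest
    labels={"source": "demo"},
)
print(content_id)  # id from the last graph's upload only
```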
```diff
@@ -548,35 +609,32 @@ class IndexifyClient:
         response = self.get(f"namespaces/{self.namespace}/schemas")
         return response.json()
 
-    def
+    def get_extracted_content(
+        self, ingested_content_id: str, graph_name: str, extractor_name: str, blocking=False
+    ):
         """
-        Get
+        Get list of child for a given content id and their content up to the specified level.
 
         Args:
-            [1 line not preserved in this view]
+            - ingested_content_id (str): id of content
+            - graph_name (str): name of extraction graph
+            - extractor_name (str): name of extractor
+            - blocking (bool): wait for extraction to complete before returning (default: False)
         """
+        if blocking:
+            self.wait_for_extraction(ingested_content_id)
         response = self.get(
-            f"namespaces/{self.namespace}/
+            f"namespaces/{self.namespace}/extraction_graphs/{graph_name}/extraction_policies/{extractor_name}/content/{ingested_content_id}"
         )
-
-
-    def get_extracted_content(self, content_id: str, graph_name: str, policy_name: str):
-        """
-        Get list of child for a given content id and their content up to the specified level.
-
-        Args:
-            - content_id (str): id of content
-            - level (int): depth of content retrieval (default: 0)
-        """
-        content_tree = self.get_content_tree(content_id)
+        content_tree = response.json()
         child_list = []
         for item in content_tree["content_tree_metadata"]:
             if (
                 graph_name in item["extraction_graph_names"]
-                and item["source"] == policy_name
+                and item["source"] == extractor_name
             ):
                 content = self.download_content(item["id"])
-                child_list.append({"id": item["id"], "content": content})
+                child_list.append({"id": item["id"], "mime_type": item["mime_type"], "content": content})
 
         return child_list
 
```
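The two earlier helpers collapse into a single `get_extracted_content` that queries the new policy-scoped route directly and can block until extraction finishes; each returned child now also carries its mime type. A sketch (ids and names are placeholders):

```python
from indexify import IndexifyClient

client = IndexifyClient()
children = client.get_extracted_content(
    ingested_content_id="content-123",
    graph_name="ingest-pdfs",
    extractor_name="chunker",
    blocking=True,  # wait for extraction before reading results
)
for child in children:
    print(child["id"], child["mime_type"], len(child["content"]))
```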
```diff
@@ -634,9 +692,13 @@ class IndexifyClient:
         """
         if type(content_ids) == str:
             content_ids = [content_ids]
-        print(
+        print(
+            "Waiting for extraction to complete for content id: ", ",".join(content_ids)
+        )
         for content_id in content_ids:
-            response = self.get(
+            response = self.get(
+                f"namespaces/{self.namespace}/content/{content_id}/wait"
+            )
             print("Extraction completed for content id: ", content_id)
             response.raise_for_status()
 
```
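`wait_for_extraction` now blocks on a dedicated `/content/{id}/wait` route, one request per id. A sketch (the id is a placeholder):

```python
from indexify import IndexifyClient

client = IndexifyClient()
# Accepts one id or a list of ids; returns once the server reports
# extraction finished for each.
client.wait_for_extraction("content-123")
```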
```diff
--- /dev/null
+++ indexify-0.0.31/indexify/data_containers.py
@@ -0,0 +1,37 @@
+from enum import Enum
+from typing import List
+from dataclasses import dataclass, field
+
+@dataclass
+class Content:
+    id: str
+    parent_id: str
+    labels: dict[str, any]
+    extraction_graph_names: List[str]
+    extraction_policy: str
+    mime_type: str
+
+    @classmethod
+    def from_dict(cls, json: dict):
+        return Content(
+            id=json["id"],
+            parent_id=json["parent_id"],
+            labels=json["labels"],
+            extraction_graph_names=json["extraction_graph_names"],
+            extraction_policy=json["source"],
+            mime_type=json["mime_type"],
+        )
+
+@dataclass
+class TextChunk:
+    text: str
+    metadata: dict[str, any] = field(default_factory=dict)
+    score: float = 0.0
+
+    def to_dict(self):
+        return {"text": self.text, "metadata": self.metadata}
+
+
+@dataclass
+class SearchResult:
+    results: List[TextChunk]
```
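Note the one field rename in `Content.from_dict`: the server payload's `source` key becomes the `extraction_policy` attribute. A sketch of the expected payload shape (all values are placeholders):

```python
from indexify.data_containers import Content

payload = {
    "id": "content-123",
    "parent_id": "",
    "labels": {"mime_type": "text/plain"},
    "extraction_graph_names": ["ingest-pdfs"],
    "source": "ingestion",  # mapped to Content.extraction_policy
    "mime_type": "text/plain",
}
content = Content.from_dict(payload)
assert content.extraction_policy == "ingestion"
```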
```diff
--- indexify-0.0.28/pyproject.toml
+++ indexify-0.0.31/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "indexify"
-version = "0.0.28"
+version = "0.0.31"
 description = "Python Client for Indexify"
 authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
 license = "Apache 2.0"
```
```diff
--- indexify-0.0.28/indexify/data_containers.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from enum import Enum
-from typing import List
-from dataclasses import dataclass, field
-
-
-@dataclass
-class TextChunk:
-    text: str
-    metadata: dict[str, any] = field(default_factory=dict)
-    score: float = 0.0
-
-    def to_dict(self):
-        return {"text": self.text, "metadata": self.metadata}
-
-
-@dataclass
-class SearchResult:
-    results: List[TextChunk]
```
|