indexify 0.0.34__tar.gz → 0.0.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.0.34 → indexify-0.0.36}/PKG-INFO +2 -1
- indexify-0.0.36/indexify/__init__.py +30 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/client.py +63 -27
- indexify-0.0.36/indexify/data.py +83 -0
- indexify-0.0.36/indexify/data_loaders/__init__.py +55 -0
- indexify-0.0.36/indexify/data_loaders/local_directory_loader.py +27 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/extraction_policy.py +1 -1
- indexify-0.0.36/indexify/extractor.py +120 -0
- indexify-0.0.36/indexify/extractor_utils.py +108 -0
- indexify-0.0.36/indexify/graph.py +49 -0
- indexify-0.0.36/indexify/local_runner.py +53 -0
- {indexify-0.0.34 → indexify-0.0.36}/pyproject.toml +2 -1
- indexify-0.0.34/indexify/__init__.py +0 -18
- indexify-0.0.34/indexify/data_containers.py +0 -37
- indexify-0.0.34/indexify/extractor.py +0 -47
- indexify-0.0.34/indexify/index.py +0 -17
- {indexify-0.0.34 → indexify-0.0.36}/LICENSE.txt +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/README.md +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/error.py +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/exceptions.py +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/settings.py +0 -0
- {indexify-0.0.34 → indexify-0.0.36}/indexify/utils.py +0 -0
{indexify-0.0.34 → indexify-0.0.36}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.34
+Version: 0.0.36
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0

@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: httpx[http2] (>=0.26,<0.27)
+Requires-Dist: pydantic (>=2.8,<3.0)
 Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
 Project-URL: Repository, https://github.com/tensorlakeai/indexify
 Description-Content-Type: text/markdown
indexify-0.0.36/indexify/__init__.py (new file)

@@ -0,0 +1,30 @@
+from .client import IndexifyClient
+from .extraction_policy import ExtractionGraph
+from .client import (
+    IndexifyClient,
+    Document,
+    generate_hash_from_string,
+    generate_unique_hex_id,
+)
+from .data import ContentMetadata, Content, Feature
+from .extractor import Extractor, extractor, EmbeddingSchema
+from .settings import DEFAULT_SERVICE_URL
+from . import data_loaders
+
+__all__ = [
+    "ContentMetadata",
+    "Content",
+    "data_loaders",
+    "Feature",
+    "Extractor",
+    "extractor",
+    "EmbeddingSchema",
+    "extractor",
+    "Document",
+    "IndexifyClient",
+    "ExtractionGraph",
+    "ExtractionGraphBuilder" "ExtractionPolicy",
+    "DEFAULT_SERVICE_URL",
+    "generate_hash_from_string",
+    "generate_unique_hex_id",
+]
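Compared with the 0.0.34 module (removed at the bottom of this diff), the package root now re-exports the pydantic data model, the extractor decorator, and the data_loaders subpackage. A minimal sketch of the import surface this `__all__` promises; nothing beyond the names listed above is assumed:

```python
# Sketch: imports follow the 0.0.36 __all__ shown above.
from indexify import (
    IndexifyClient,
    ExtractionGraph,
    Content,
    ContentMetadata,
    Feature,
    Extractor,
    extractor,
    EmbeddingSchema,
    data_loaders,
    DEFAULT_SERVICE_URL,
)
```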
{indexify-0.0.34 → indexify-0.0.36}/indexify/client.py

@@ -6,11 +6,11 @@ import json
 from collections import namedtuple
 from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
 from .extractor import Extractor
-from .extraction_policy import
-from .index import Index
+from .extraction_policy import ExtractionGraph
 from .utils import json_set_default
 from .error import Error
-from .
+from .data import Content, ContentMetadata
+from .data_loaders import DataLoader
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
 from typing import List, Optional, Union, Dict
@@ -316,7 +316,7 @@ class IndexifyClient:
             "content_url": f"{self._service_url}/namespaces/{self.namespace}/content/{content['id']}/download",
         }

-    def indexes(self) ->
+    def indexes(self) -> dict:
         """
         Get the indexes of the current namespace.

@@ -399,8 +399,10 @@ class IndexifyClient:
         Args:
         - content_id (str): content id to query
         """
-        response = self.get(
-
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/metadata"
+        )
+        return response.json()["content_metadata"]

     def download_content(self, content_id: str) -> bytes:
         """
@@ -409,7 +411,9 @@
         Args:
         - content_id (str): id of content to download
         """
-        response = self.get(
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/download"
+        )
         return response.content

     def add_documents(
@@ -520,7 +524,7 @@

     def search_index(
         self, name: str, query: str, top_k: int, filters: List[str] = []
-    ) ->
+    ) -> dict:
         """
         Search index in the current namespace.

@@ -573,34 +577,59 @@

     def upload_file(
         self,
-
-        path: str,
+        extraction_graph: str,
+        path: str,
+        file_bytes:bytes=None,
         id=None,
         labels: dict = {},
     ) -> str:
         """
-        Upload a file.
+        Upload a file from a path or the bytes.

         Args:
-        -
+        - extraction_graph (str): name of the extraction graph to use for extraction
+        - path (Union[str, bytes]): relative path to the file to be uploaded, or the bytes of the file
         - labels (dict): labels to be associated with the file
         """
-        if isinstance(extraction_graphs, str):
-            extraction_graphs = [extraction_graphs]
         params = {}
         if id is not None:
            params["id"] = id
-
-
+
+        if file_bytes == None:
+            with open(path, "rb") as f:
                 response = self.post(
                     f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
                     files={"file": f},
                     data={"labels": json.dumps(labels)},
                     params=params,
                 )
-
-
-
+        else:
+            response = self.post(
+                f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                files={"file": (path, file_bytes)},
+                data={"labels": json.dumps(labels)},
+                params=params,
+            )
+            file_content = path
+
+        response_json = response.json()
+        content_id = response_json["content_id"]
+        return content_id
+
+    def ingest_from_loader(self, loader: DataLoader, extraction_graph: str) -> List[str]:
+        """
+        Loads content using the loader, uploads them to Indexify and returns the content ids.
+        loader: DataLoader: The DataLoader object to use for loading content
+        extraction_graph: str: The name of the extraction graph to use for extraction
+        """
+        content_ids = []
+        files = loader.load()
+        for file_metadata in files:
+            labels={"file_name": file_metadata.path}
+            print(labels)
+            content_id = self.upload_file(extraction_graph, file_metadata.path, file_metadata.read_all_bytes(), labels=labels)
+            content_ids.append(content_id)
+        return content_ids

     def list_schemas(self) -> List[str]:
         """
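upload_file now leads with the extraction graph name and accepts raw bytes as an alternative to a filesystem path, and the new ingest_from_loader bridges a DataLoader to it. A rough usage sketch against these signatures; the graph name and paths are illustrative, and the no-argument client constructor is an assumption (it is not shown in this diff):

```python
# Hedged sketch: assumes a reachable Indexify server and an existing
# extraction graph named "wikipedia" (hypothetical name).
from indexify import IndexifyClient

client = IndexifyClient()  # constructor args not shown in this diff

# Path-based upload: the client opens and streams the file itself.
cid = client.upload_file("wikipedia", "docs/report.pdf", labels={"team": "research"})

# Bytes-based upload: `path` only supplies the file name sent to the server.
raw = open("docs/report.pdf", "rb").read()
cid = client.upload_file("wikipedia", "report.pdf", file_bytes=raw)
```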
@@ -610,7 +639,11 @@ class IndexifyClient:
         return response.json()

     def get_extracted_content(
-        self,
+        self,
+        ingested_content_id: str,
+        graph_name: str,
+        policy_name: str,
+        blocking=False,
     ):
         """
         Get list of child for a given content id and their content up to the specified level.
@@ -631,10 +664,16 @@
         for item in content_tree["content_tree_metadata"]:
             if (
                 graph_name in item["extraction_graph_names"]
-                and item["source"] == policy_name
+                and item["source"] == policy_name
             ):
                 content = self.download_content(item["id"])
-                child_list.append(
+                child_list.append(
+                    {
+                        "id": item["id"],
+                        "mime_type": item["mime_type"],
+                        "content": content,
+                    }
+                )

         return child_list

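Callers now select extracted children by the ingested content id plus graph and policy name, and each returned entry is the dict assembled above (id, mime type, raw bytes). A usage sketch with hypothetical ids and names:

```python
# Sketch: the content id, graph, and policy names are hypothetical;
# the default-constructed client follows the sketch above.
from indexify import IndexifyClient

client = IndexifyClient()
children = client.get_extracted_content(
    ingested_content_id="1234abcd",
    graph_name="wikipedia",
    policy_name="chunker",
)
for child in children:
    print(child["id"], child["mime_type"], len(child["content"]))
```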
@@ -660,23 +699,20 @@

     def ingest_remote_file(
         self,
-
+        extraction_graph: str,
         url: str,
         mime_type: str,
         labels: Dict[str, str],
         id=None,
     ):
-        if isinstance(extraction_graphs, str):
-            extraction_graphs = [extraction_graphs]
         req = {
             "url": url,
             "mime_type": mime_type,
             "labels": labels,
             "id": id,
-            "extraction_graph_names": extraction_graphs,
         }
         response = self.post(
-            f"namespaces/{self.namespace}/
+            f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract_remote",
             json=req,
             headers={"Content-Type": "application/json"},
         )
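Remote ingestion mirrors upload_file: a single graph name routed through the URL replaces the old extraction_graph_names list in the request body. A sketch with an illustrative URL and labels:

```python
# Sketch: graph name, URL, mime type, and labels are illustrative.
from indexify import IndexifyClient

client = IndexifyClient()
client.ingest_remote_file(
    "wikipedia",
    "https://example.com/report.pdf",
    "application/pdf",
    {"source": "example"},
)
```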
indexify-0.0.36/indexify/data.py (new file)

@@ -0,0 +1,83 @@
+from typing import Any, List, Optional, Literal, Dict
+from pydantic import BaseModel, Json, Field
+import json
+
+
+class Feature(BaseModel):
+    feature_type: Literal["embedding", "metadata"]
+    name: str
+    value: Json
+    comment: Optional[Json] = Field(default=None)
+
+    @classmethod
+    def embedding(cls, values: List[float], name: str = "embedding", distance="cosine"):
+        return cls(
+            feature_type="embedding",
+            name=name,
+            value={values: values, distance: distance},
+            comment=None,
+        )
+
+    @classmethod
+    def metadata(cls, value: Json, comment: Json = None, name: str = "metadata"):
+        value = json.dumps(value)
+        comment = json.dumps(comment) if comment is not None else None
+        return cls(feature_type="metadata", name=name, value=value)
+
+
+class Content(BaseModel):
+    id: str
+    content_type: Optional[str]
+    data: bytes
+    features: List[Feature] = []
+
+    @classmethod
+    def from_text(
+        cls,
+        text: str,
+        features: List[Feature] = [],
+    ):
+        return Content(
+            id="none-for-now",
+            content_type="text/plain",
+            data=bytes(text, "utf-8"),
+            features=features,
+        )
+
+    @classmethod
+    def from_json(cls, json_data: Json, features: List[Feature] = []):
+        return cls(
+            content_type="application/json",
+            data=bytes(json.dumps(json_data), "utf-8"),
+            features=features,
+        )
+
+    @classmethod
+    def from_file(cls, path: str):
+        import mimetypes
+
+        m, _ = mimetypes.guess_type(path)
+        with open(path, "rb") as f:
+            return cls(content_type=m, data=f.read())
+
+
+class ContentMetadata(BaseModel):
+    id: str
+    parent_id: str
+    labels: Dict[str, Any]
+    extraction_graph_names: List[str]
+    extraction_policy: str
+    mime_type: str
+    extracted_metadata: Dict[str, Any] = {}
+
+    @classmethod
+    def from_dict(cls, json: Dict):
+        return cls(
+            id=json["id"],
+            parent_id=json["parent_id"],
+            labels=json["labels"],
+            extraction_graph_names=json["extraction_graph_names"],
+            extraction_policy=json["source"],
+            mime_type=json["mime_type"],
+            extracted_metadata=json["extracted_metadata"],
+        )
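The dataclass-based containers from 0.0.34 are replaced by pydantic models, and Content gains constructors for text, JSON, and files. A small sketch using only the constructors defined above; the text and label values are illustrative:

```python
# Sketch: builds a text Content with an attached metadata Feature.
from indexify import Content, Feature

doc = Content.from_text(
    "hello indexify",
    features=[Feature.metadata({"lang": "en"}, name="language")],
)
print(doc.content_type, len(doc.data), doc.features[0].name)
# -> text/plain 14 language
```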
indexify-0.0.36/indexify/data_loaders/__init__.py (new file)

@@ -0,0 +1,55 @@
+from pydantic import BaseModel
+
+from abc import ABC, abstractmethod
+from typing import List
+import os
+import mimetypes
+import hashlib
+
+class FileMetadata(BaseModel):
+    path: str
+    file_size: int
+    mime_type: str
+    md5_hash: str
+    created_at: int
+    updated_at: int
+
+    @classmethod
+    def from_path(cls, path: str):
+        file_size = os.path.getsize(path)
+        mime_type = mimetypes.guess_type(path)[0]
+
+        # Compute MD5 hash
+        hash_md5 = hashlib.md5()
+        with open(path, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        md5_hash = hash_md5.hexdigest()
+
+        created_at = int(os.path.getctime(path))
+        updated_at = int(os.path.getmtime(path))
+
+        return cls(
+            path=path,
+            file_size=file_size,
+            mime_type=str(mime_type),
+            md5_hash=md5_hash,
+            created_at=created_at,
+            updated_at=updated_at,
+        )
+
+    def read_all_bytes(self) -> bytes:
+        with open(self.path, "rb") as f:
+            return f.read()
+
+
+class DataLoader(ABC):
+    @abstractmethod
+    def load(self) -> List[FileMetadata]:
+        pass
+
+    @abstractmethod
+    def state(self) -> dict:
+        pass
+
+from .local_directory_loader import LocalDirectoryLoader
indexify-0.0.36/indexify/data_loaders/local_directory_loader.py (new file)

@@ -0,0 +1,27 @@
+from . import DataLoader, FileMetadata
+from typing import List, Optional
+import os
+
+
+class LocalDirectoryLoader(DataLoader):
+    def __init__(self, directory: str, file_extensions: Optional[List[str]] = None, state: dict ={}):
+        self.directory = directory
+        self.file_extensions = file_extensions
+        self.processed_files = set(state.get("processed_files", []))
+
+    def load(self) -> List[FileMetadata]:
+        file_metadata_list = []
+        for root, _, files in os.walk(self.directory):
+            for file in files:
+                if self.file_extensions is None or any(
+                    file.endswith(ext) for ext in self.file_extensions
+                ):
+                    file_path = os.path.join(root, file)
+                    if file_path not in self.processed_files:
+                        file_metadata_list.append(FileMetadata.from_path(file_path))
+                        self.processed_files.add(file_path)
+
+        return file_metadata_list
+
+    def state(self) -> dict:
+        return {"processed_files": list(self.processed_files)}
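LocalDirectoryLoader walks a directory and tracks what it has already seen, and state() exposes that set so a caller can persist it and resume later. A sketch with illustrative paths and extensions:

```python
# Sketch: directory and extensions are illustrative.
from indexify.data_loaders import LocalDirectoryLoader

loader = LocalDirectoryLoader("./docs", file_extensions=[".pdf", ".txt"])
for fm in loader.load():
    print(fm.path, fm.file_size, fm.md5_hash)

checkpoint = loader.state()  # {"processed_files": [...]}
# A fresh loader seeded with the checkpoint skips files already processed.
resumed = LocalDirectoryLoader("./docs", file_extensions=[".pdf", ".txt"], state=checkpoint)
```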
indexify-0.0.36/indexify/extractor.py (new file)

@@ -0,0 +1,120 @@
+from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
+import inspect
+from pydantic import BaseModel
+from abc import ABC, abstractmethod
+from .data import Content, Feature
+import json
+
+class EmbeddingSchema(BaseModel):
+    dimension: int
+
+class Extractor(ABC):
+    name: str = ""
+
+    version: str = "0.0.0"
+
+    system_dependencies: List[str] = []
+
+    python_dependencies: List[str] = []
+
+    description: str = ""
+
+    input_mime_types = ["text/plain"]
+
+    def extract(
+        self, content: Content, params: Type[BaseModel] = None
+    ) -> List[Union[Feature, Content]]:
+        """
+        Extracts information from the content. Returns a list of features to add
+        to the content.
+        It can also return a list of Content objects, which will be added to storage
+        and any extraction policies defined will be applied to them.
+        """
+        pass
+
+    def extract_batch(
+        self, content_list: List[Content], params: List[Type[BaseModel]] = None
+    ) -> List[List[Union[Feature, Content]]]:
+        """
+        Extracts information from the content. Returns a list of features to add
+        to the content.
+        It can also return a list of Content objects, which will be added to storage
+        and any extraction policies defined will be applied to them.
+        """
+        pass
+
+    @classmethod
+    @abstractmethod
+    def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
+        pass
+
+    @classmethod
+    @abstractmethod
+    def embedding_schemas(cls) -> Dict[str, EmbeddingSchema]:
+        raise NotImplementedError
+
+    def describe(self) -> Dict:
+        embedding_schemas = {}
+        try:
+            embedding_schemas = self.embedding_schemas()
+        except NotImplementedError:
+            pass
+
+        json_schema = (
+            self._param_cls.model_json_schema() if self._param_cls is not None else None
+        )
+
+        return {
+            "name": self.name,
+            "version": self.version,
+            "description": self.description,
+            "system_dependencies": self.system_dependencies,
+            "python_dependencies": self.python_dependencies,
+            "input_mime_types": self.input_mime_types,
+            "embedding_schemas": embedding_schemas,
+            "input_params": json.dumps(json_schema),
+        }
+
+def extractor(
+    name: Optional[str] = None,
+    description: Optional[str] = "",
+    version: Optional[str] = "",
+    python_dependencies: Optional[List[str]] = None,
+    system_dependencies: Optional[List[str]] = None,
+    input_mime_types: Optional[List[str]] = None,
+    embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
+    sample_content: Optional[Callable] = None,
+):
+    args = locals()
+    del args["sample_content"]
+
+    def construct(fn):
+        def wrapper():
+            hint = get_type_hints(fn).get("params", dict)
+
+            if not args.get("name"):
+                args["name"] = (
+                    f"{inspect.getmodule(inspect.stack()[1][0]).__name__}:{fn.__name__}"
+                )
+
+            class DecoratedFn(Extractor):
+                @classmethod
+                def extract(cls, content: Content, params: hint) -> List[Content]:  # type: ignore
+                    # TODO we can force all the functions to take in a parms object
+                    # or check if someone adds a params
+                    if params is None:
+                        return fn(content)
+                    else:
+                        return fn(content, params)
+
+                def sample_input(self) -> Content:
+                    return sample_content() if sample_content else self.sample_text()
+
+            for key, val in args.items():
+                setattr(DecoratedFn, key, val)
+
+            return DecoratedFn
+
+        return wrapper
+
+    return construct
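The @extractor decorator wraps a plain function in a generated Extractor subclass: the decorated name becomes a factory, calling it returns the class, and the class's extract dispatches back to the function (passing params only when one was supplied). A sketch against the code above; the upper-casing body is illustrative:

```python
# Sketch: the transformation itself is illustrative.
from typing import List
from indexify import Content, extractor

@extractor(description="upper-cases text content")
def shout(content: Content) -> List[Content]:
    text = content.data.decode("utf-8")
    return [Content.from_text(text.upper())]

# The decorator yields a factory; calling it produces the Extractor subclass.
ShoutExtractor = shout()
results = ShoutExtractor.extract(Content.from_text("hi"), None)  # [Content(data=b"HI", ...)]
```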
indexify-0.0.36/indexify/extractor_utils.py (new file)

@@ -0,0 +1,108 @@
+import os
+import httpx
+from typing import List
+from .data import Content, Feature
+
+class SampleExtractorData:
+
+    def _download_file(self, url, filename):
+        if os.path.exists(filename):
+            # file exists skip
+            return
+        try:
+            with httpx.get(url, stream=True) as r:
+                r.raise_for_status()  # Raises an HTTPError if the response status code is 4XX/5XX
+                with open(filename, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+        except httpx.exceptions.RequestException as e:
+            print(f"Error downloading the file: {e}")
+
+    def sample_mp3(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.mp3"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample-000009.mp3",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="audio/mpeg", data=f.read(), features=features)
+
+    def sample_mp4(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.mp4"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample.mp4",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="video/mp4", data=f.read(), features=features)
+
+    def sample_jpg(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.jpg"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/people-standing.jpg",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="image/jpg", data=f.read(), features=features)
+
+    def sample_invoice_jpg(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.jpg"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.jpg",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="image/jpg", data=f.read(), features=features)
+
+    def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/image-based.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_scientific_pdf(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.pdf"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/scientific-paper-example.pdf",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="application/pdf", data=f.read(), features=features)
+
+    def sample_presentation(self, features: List[Feature] = []) -> Content:
+        file_name = "test.pptx"
+        self._download_file(
+            "https://raw.githubusercontent.com/tensorlakeai/indexify/main/docs/docs/files/test.pptx",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(
+            content_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
+            data=f.read(),
+            features=features,
+        )
+
+    def sample_text(self, features: List[Feature] = []) -> Content:
+        article = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""
+        return Content(content_type="text/plain", data=article, features=features)
+
+    def sample_html(self, features: List[Feature] = []) -> Content:
+        file_name = "sample.html"
+        self._download_file(
+            "https://extractor-files.diptanu-6d5.workers.dev/sample.html",
+            file_name,
+        )
+        f = open(file_name, "rb")
+        return Content(content_type="text/html", data=f.read(), features=features)
indexify-0.0.36/indexify/graph.py (new file)

@@ -0,0 +1,49 @@
+from indexify import Content, extractor
+from indexify.extractor import Extractor
+
+from collections import defaultdict
+from typing import Any, Callable, Dict, List, Optional
+
+
+@extractor(description="id function")
+def _id(content: Content) -> List[Content]:
+    return [content]
+
+class Graph:
+    def __init__(self, name: str):
+        # TODO check for cycles
+        self.name = name
+
+        self.nodes: Dict[str, Callable] = {}
+        self.params: Dict[str, Any] = {}
+
+        self.edges: Dict[str, List[(str, str)]] = defaultdict(list)
+
+        self.results: Dict[str, Any] = defaultdict(list)  # TODO should the Any be Content?
+
+        self.nodes["start"] = _id
+        self.nodes["end"] = _id
+
+        self._topo_counter = defaultdict(int)
+
+        self._start_node = None
+
+    def node(self, name: str, closure: Extractor, params: Any = None) -> None:
+        if name in self.nodes:
+            raise Exception(f"Cannot insert node, node with name: `{name}` already exists")
+
+        self.nodes[name] = closure
+        self.params[name] = params
+
+        # assign each node a rank of 1 to init the graph
+        self._topo_counter[name] = 1
+
+    def edge(self, from_node: str, to_node: str, prefilter_predicates: Optional[str] = None) -> None:
+        self.edges[from_node].append((to_node, prefilter_predicates))
+
+        self._topo_counter[to_node] += 1
+
+    def _assign_start_node(self):
+        # this method should be called before a graph can be run
+        nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
+        self._start_node = nodes[0][0]
indexify-0.0.36/indexify/local_runner.py (new file)

@@ -0,0 +1,53 @@
+from indexify import Content
+
+from collections import defaultdict
+from typing import Any, Callable, Dict, Optional
+
+class LocalRunner:
+    def __init__(self):
+        self.results: Dict[str, Any] = defaultdict(list)  # TODO should the Any be Content?
+
+    def run(self, g, content: Content):
+        g._assign_start_node()
+        return self._run(g, content=content, node_name=g._start_node)
+
+    def _run(self, g, content: Content, node_name: str):
+        extractor_construct: Callable = g.nodes[node_name]
+        params = g.params.get(node_name, None)
+
+        res = extractor_construct().extract(content=content, params=params)
+
+        self.results[node_name].extend(res)
+
+        for out_edge, pre_filter_predicate in g.edges[node_name]:
+            # TODO there are no reductions yet, each recursion finishes it's path and returns
+            for r in res:
+                if self._prefilter_content(content=r, prefilter_predicate=pre_filter_predicate):
+                    continue
+
+                self._run(g, content=r, node_name=out_edge)
+
+    def _prefilter_content(self, content: Content, prefilter_predicate: Optional[str]) -> bool:
+        if prefilter_predicate is None:
+            return False
+
+        atoms = prefilter_predicate.split('and')
+        if len(atoms) == 0 or len(atoms) == 1:
+            return False
+
+        # TODO For now only support `and` and `=` and `string values`
+        bools = []
+        for feature in content.features:
+            if feature.feature_type == 'metadata':
+                values = feature.value
+
+                print(f'{prefilter_predicate, atoms}')
+                for atom in atoms:
+                    l, r = atom.split('=')
+                    if l in values:
+                        bools.append(values[l] == r)
+
+        return all(bools)
+
+    def get_result(self, node_name: str) -> Content:
+        return self.results[node_name]
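Together, Graph and LocalRunner give an in-process way to chain extractors: nodes hold extractor factories, edges carry optional prefilter predicates, and the runner walks the graph depth-first from whichever node the topology counter ranks lowest. An end-to-end sketch reusing the decorator shown earlier; node names and the splitting logic are illustrative:

```python
# Sketch: chains a line-splitting extractor into the built-in "end" node.
from typing import List
from indexify import Content, extractor
from indexify.graph import Graph
from indexify.local_runner import LocalRunner

@extractor(description="split text into one Content per line")
def split_lines(content: Content) -> List[Content]:
    text = content.data.decode("utf-8")
    return [Content.from_text(line) for line in text.splitlines()]

g = Graph("demo")
g.node("split", split_lines)   # registers the factory and its topo rank
g.edge("split", "end")         # no prefilter predicate

runner = LocalRunner()
runner.run(g, Content.from_text("first\nsecond"))
print(runner.get_result("split"))  # two Content objects
```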
{indexify-0.0.34 → indexify-0.0.36}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "indexify"
-version = "0.0.34"
+version = "0.0.36"
 description = "Python Client for Indexify"
 authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Lucas Jackson <lucas@tensorlake.ai>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
 license = "Apache 2.0"

@@ -12,6 +12,7 @@ repository = "https://github.com/tensorlakeai/indexify"
 python = "^3.9"
 httpx = { version = "^0.26", extras = ["http2"] }
 pyyaml = "^6.0.1"
+pydantic = "^2.8"

 [tool.poetry.dev-dependencies]
 black = "^22.3.0"
indexify-0.0.34/indexify/__init__.py (removed)

@@ -1,18 +0,0 @@
-from .index import Index
-from .client import IndexifyClient
-from .extraction_policy import ExtractionGraph
-from .client import IndexifyClient, Document, generate_hash_from_string, generate_unique_hex_id
-from .data_containers import Content
-from .settings import DEFAULT_SERVICE_URL
-
-__all__ = [
-    "Index",
-    "Content",
-    "Document",
-    "IndexifyClient",
-    "ExtractionGraph",
-    "ExtractionGraphBuilder" "ExtractionPolicy",
-    "DEFAULT_SERVICE_URL",
-    "generate_hash_from_string",
-    "generate_unique_hex_id",
-]
indexify-0.0.34/indexify/data_containers.py (removed)

@@ -1,37 +0,0 @@
-from enum import Enum
-from typing import List
-from dataclasses import dataclass, field
-
-@dataclass
-class Content:
-    id: str
-    parent_id: str
-    labels: dict[str, any]
-    extraction_graph_names: List[str]
-    extraction_policy: str
-    mime_type: str
-
-    @classmethod
-    def from_dict(cls, json: dict):
-        return Content(
-            id=json["id"],
-            parent_id=json["parent_id"],
-            labels=json["labels"],
-            extraction_graph_names=json["extraction_graph_names"],
-            extraction_policy=json["source"],
-            mime_type=json["mime_type"],
-        )
-
-@dataclass
-class TextChunk:
-    text: str
-    metadata: dict[str, any] = field(default_factory=dict)
-    score: float = 0.0
-
-    def to_dict(self):
-        return {"text": self.text, "metadata": self.metadata}
-
-
-@dataclass
-class SearchResult:
-    results: List[TextChunk]
indexify-0.0.34/indexify/extractor.py (removed)

@@ -1,47 +0,0 @@
-from dataclasses import dataclass
-from typing import Union
-
-from .settings import DEFAULT_SERVICE_URL
-
-
-@dataclass
-class EmbeddingSchema:
-    distance: str
-    dim: int
-
-
-@dataclass
-class ExtractorSchema:
-    outputs: dict[str, Union[EmbeddingSchema, dict]]
-
-
-class Extractor:
-    def __init__(
-        self,
-        name: str,
-        description: str,
-        input_params: dict,
-        outputs: ExtractorSchema,
-        input_mime_types: list[str],
-    ):
-        self.name = name
-        self.description = description
-        self.input_params = input_params
-        self.outputs = outputs
-        self.input_mime_types = input_mime_types
-
-    @classmethod
-    def from_dict(cls, data):
-        return Extractor(
-            name=data["name"],
-            description=data["description"],
-            input_params=data["input_params"],
-            input_mime_types=data["input_mime_types"],
-            outputs=data["outputs"],
-        )
-
-    def __repr__(self) -> str:
-        return f"Extractor(name={self.name}, description={self.description}, input_params={self.input_params}, input_mime_types={self.input_mime_types}, outputs={self.outputs})"
-
-    def __str__(self) -> str:
-        return self.__repr__()
indexify-0.0.34/indexify/index.py (removed)

@@ -1,17 +0,0 @@
-import httpx
-
-from .data_containers import TextChunk
-
-
-class Index:
-    def __init__(self, service_url, index):
-        self._service_url = service_url
-        self._index = index
-
-    def search(self, query: str, top_k: int) -> list[TextChunk]:
-        req = {"index": self._index, "query": query, "k": top_k}
-        response = httpx.post(
-            f"{self._service_url}/indexes/{self._index}/search", json=req
-        )
-        response.raise_for_status()
-        return response.json()["results"]
Files without changes: LICENSE.txt, README.md, indexify/error.py, indexify/exceptions.py, indexify/settings.py, indexify/utils.py.