indexify 0.0.1__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
indexify-0.0.3/PKG-INFO ADDED
@@ -0,0 +1,53 @@
+ Metadata-Version: 2.1
+ Name: indexify
+ Version: 0.0.3
+ Summary: Python Client for Indexify
+ Home-page: https://github.com/diptanu/indexify
+ License: Apache 2.0
+ Author: Diptanu Gon Choudhury
+ Author-email: diptanuc@gmail.com
+ Requires-Python: >=3.10.0,<4.0.0
+ Classifier: License :: Other/Proprietary License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Dist: aiohttp (>=3.4,<4.0)
+ Project-URL: Repository, https://github.com/diptanu/indexify
+ Description-Content-Type: text/markdown
+
+ # Indexify Python Client
+
+ ## Installation
+
+ This is the Python client for interacting with the Indexify service.
+
+ To install it, simply run:
+
+ ```shell
+ pip install indexify
+ ```
+
+ ## Usage
+
+ See the [getting started](https://getindexify.com/getting_started/) guide for examples of how to use the client.
+
+ ## Development
+
+ For first-time setup, follow the steps [here](https://getindexify.com/develop/).
+
+ ### Steps for restarting the dev server after updating server code
+
+ ```shell
+ ./install_python_deps.sh
+ # use `-e` if you're developing extractors
+ (cd extractors && pip install -e .)
+ # use `-e` if you're developing sdk-py
+ (cd sdk-py && pip install -e .)
+
+ cargo build
+ make local-dev
+
+ # start the server
+ ./target/debug/indexify start-server -d -c local_config.yaml
+ ```
+
indexify-0.0.3/README.md ADDED
@@ -0,0 +1,35 @@
+ # Indexify Python Client
+
+ ## Installation
+
+ This is the Python client for interacting with the Indexify service.
+
+ To install it, simply run:
+
+ ```shell
+ pip install indexify
+ ```
+
+ ## Usage
+
+ See the [getting started](https://getindexify.com/getting_started/) guide for examples of how to use the client.
+
+ ## Development
+
+ For first-time setup, follow the steps [here](https://getindexify.com/develop/).
+
+ ### Steps for restarting the dev server after updating server code
+
+ ```shell
+ ./install_python_deps.sh
+ # use `-e` if you're developing extractors
+ (cd extractors && pip install -e .)
+ # use `-e` if you're developing sdk-py
+ (cd sdk-py && pip install -e .)
+
+ cargo build
+ make local-dev
+
+ # start the server
+ ./target/debug/indexify start-server -d -c local_config.yaml
+ ```
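Editor's note: as orientation for the new 0.0.3 surface (not part of the packaged README), a minimal end-to-end sketch, assuming `pip install indexify==0.0.3` and an Indexify server listening on the default `http://localhost:8900`:

```python
# Usage sketch, not from the package; assumes a live server at the
# default service URL (http://localhost:8900).
from indexify import IndexifyClient

client = IndexifyClient()  # defaults to DEFAULT_SERVICE_URL
repo = client.get_or_create_repository("default")
repo.add_documents({"text": "Indexify is a retrieval service."})
print(client.list_repositories())
```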
indexify-0.0.3/indexify/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from .index import Index, AIndex
+ from .client import IndexifyClient
+ from .memory import Memory, AMemory
+ from .repository import Repository, ARepository, create_repository, list_repositories
+ from .data_containers import TextChunk, Message
+ from .utils import wait_until
+ from .settings import DEFAULT_SERVICE_URL
+
+
+ __all__ = ["Index", "Memory", "Repository", "AIndex", "AMemory", "ARepository",
+            "Message", "TextChunk", "DEFAULT_SERVICE_URL", "wait_until", "IndexifyClient"]
indexify-0.0.3/indexify/client.py ADDED
@@ -0,0 +1,30 @@
+ from .extractor import Extractor, list_extractors
+ from .repository import Repository, create_repository, list_repositories
+ from .settings import DEFAULT_SERVICE_URL
+
+
+ class IndexifyClient:
+
+     def __init__(self, service_url: str = DEFAULT_SERVICE_URL):
+         self._service_url = service_url
+
+     def create_repository(self, name: str, extractors: list = [], metadata: dict = {}) -> dict:
+         return create_repository(name, extractors, metadata, self._service_url)
+
+     @property
+     def extractors(self) -> list[Extractor]:
+         return [Extractor(**extractor) for extractor in list_extractors(self._service_url)]
+
+     def get_or_create_repository(self, name: str) -> Repository:
+         return Repository(name=name, service_url=self._service_url)
+
+     def list_extractors(self) -> list[dict]:
+         return list_extractors(base_url=self._service_url)
+
+     def list_repositories(self) -> list[dict]:
+         return list_repositories(service_url=self._service_url)
+
+     @property
+     def repositories(self) -> list[Repository]:
+         # TODO: implement this
+         pass
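For reference, a hedged sketch of driving `IndexifyClient`; the repository name and metadata are illustrative, and a live server is assumed:

```python
from indexify import IndexifyClient

client = IndexifyClient("http://localhost:8900")
client.create_repository("research", metadata={"owner": "me"})
print(client.list_repositories())    # GET /repositories
for extractor in client.extractors:  # GET /extractors, wrapped in Extractor
    print(extractor)                 # Extractor(name=..., description=...)
```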
indexify-0.0.3/indexify/data_containers.py ADDED
@@ -0,0 +1,46 @@
+ from enum import Enum
+ from typing import Any, List
+ from dataclasses import dataclass, field
+
+
+ class TextSplitter(str, Enum):
+     NEWLINE = "new_line"
+     REGEX = "regex"
+     NOOP = "noop"
+
+     def __str__(self) -> str:
+         return self.value.lower()
+
+
+ @dataclass
+ class TextChunk:
+     text: str
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def to_dict(self):
+         return {"text": self.text, "metadata": self.metadata}
+
+
+ @dataclass
+ class Message:
+     role: str
+     text: str
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def to_dict(self):
+         return {"role": self.role, "text": self.text, "metadata": self.metadata}
+
+
+ @dataclass
+ class SearchChunk:
+     index: str
+     query: str
+     k: int
+
+     def to_dict(self):
+         return {"index": self.index, "query": self.query, "k": self.k}
+
+
+ @dataclass
+ class SearchResult:
+     results: List[TextChunk]
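These containers are plain dataclasses, so they can be exercised without a server; a small sketch:

```python
from indexify import Message, TextChunk

chunk = TextChunk(text="hello world")              # metadata defaults to {}
message = Message(role="user", text="What is Indexify?")
print(chunk.to_dict())    # {'text': 'hello world', 'metadata': {}}
print(message.to_dict())  # {'role': 'user', 'text': 'What is Indexify?', 'metadata': {}}
```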
indexify-0.0.3/indexify/extractor.py ADDED
@@ -0,0 +1,25 @@
+ import requests
+
+ from .settings import DEFAULT_SERVICE_URL
+
+
+ def list_extractors(base_url: str = DEFAULT_SERVICE_URL) -> list[dict]:
+     response = requests.get(f"{base_url}/extractors")
+     response.raise_for_status()
+     return response.json()['extractors']
+
+
+ # TODO: consider naming this IndexifyExtractor
+ # TODO: consider making this a dataclass
+ class Extractor:
+
+     def __init__(self, name: str, description: str, extractor_type: dict):
+         self.name = name
+         self.description = description
+         self.extractor_type = extractor_type
+
+     def __repr__(self) -> str:
+         return f"Extractor(name={self.name}, description={self.description})"
+
+     def __str__(self) -> str:
+         return self.__repr__()
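A sketch of the module-level helper, assuming a running server with extractors registered; each returned dict carries the `name`, `description`, and `extractor_type` fields consumed by `Extractor(**extractor)` above:

```python
from indexify.extractor import list_extractors

for info in list_extractors(base_url="http://localhost:8900"):
    print(f"{info['name']}: {info['description']}")
```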
indexify-0.0.3/indexify/index.py ADDED
@@ -0,0 +1,31 @@
+ import aiohttp
+
+ from .data_containers import SearchChunk, TextChunk
+ from .utils import _get_payload, wait_until
+
+
+ class AIndex:
+
+     def __init__(self, url: str, index: str = "default/default"):
+         self._url = url
+         self._index = index
+
+     async def search(self, query: str, top_k: int) -> list[TextChunk]:
+         req = SearchChunk(index=self._index, query=query, k=top_k)
+         async with aiohttp.ClientSession() as session:
+             async with session.get(f"{self._url}/index/search", json=req.to_dict()) as resp:
+                 payload = await _get_payload(resp)
+                 result = []
+                 for res in payload["results"]:
+                     result.append(TextChunk(text=res["text"], metadata=res["metadata"]))
+                 return result
+
+
+ class Index(AIndex):
+
+     def __init__(self, url, index):
+         AIndex.__init__(self, url, index)
+
+     def search(self, query: str, top_k: int) -> list[TextChunk]:
+         return wait_until(AIndex.search(self, query, top_k))
+
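The async variant can be driven directly with asyncio; a sketch assuming a populated `default/default` index on a live server:

```python
import asyncio

from indexify import AIndex

async def main() -> None:
    index = AIndex("http://localhost:8900", index="default/default")
    for chunk in await index.search("hello", top_k=3):
        print(chunk.text, chunk.metadata)

asyncio.run(main())
```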
indexify-0.0.3/indexify/memory.py ADDED
@@ -0,0 +1,53 @@
+ import aiohttp
+
+ from .data_containers import Message
+ from .utils import _get_payload, wait_until
+
+
+ class AMemory:
+
+     def __init__(self, url, repository="default"):
+         self._session_id = None
+         self._url = url
+         self._repo = repository
+
+     async def create(self) -> str:
+         async with aiohttp.ClientSession() as session:
+             async with session.post(f"{self._url}/memory/create", json={"repository": self._repo}) as resp:
+                 resp = await _get_payload(resp)
+                 self._session_id = resp["session_id"]
+                 return self._session_id
+
+     async def add(self, *messages: Message) -> None:
+         parsed_messages = []
+         for message in messages:
+             parsed_messages.append(message.to_dict())
+
+         req = {"session_id": self._session_id, "repository": self._repo, "messages": parsed_messages}
+         async with aiohttp.ClientSession() as session:
+             async with session.post(f"{self._url}/memory/add", json=req) as resp:
+                 return await _get_payload(resp)
+
+     async def all(self) -> list[Message]:
+         req = {"session_id": self._session_id, "repository": self._repo}
+         async with aiohttp.ClientSession() as session:
+             async with session.get(f"{self._url}/memory/get", json=req) as resp:
+                 payload = await _get_payload(resp)
+                 messages = []
+                 for raw_message in payload["messages"]:
+                     messages.append(Message(raw_message["role"], raw_message["text"], raw_message["metadata"]))
+                 return messages
+
+
+ class Memory(AMemory):
+     def __init__(self, url, repository="default"):
+         AMemory.__init__(self, url, repository)
+
+     def create(self) -> str:
+         return wait_until(AMemory.create(self))
+
+     def add(self, *messages: Message) -> None:
+         wait_until(AMemory.add(self, *messages))
+
+     def all(self) -> list[Message]:
+         return wait_until(AMemory.all(self))
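A sketch of a memory session through the blocking `Memory` wrapper (server required; the message contents are illustrative):

```python
from indexify import Memory, Message

memory = Memory("http://localhost:8900")
session_id = memory.create()               # POST /memory/create
memory.add(Message("human", "Indexify is amazing!"),
           Message("assistant", "How can I help you?"))
print(session_id)
print(memory.all())                        # replays the stored messages
```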
indexify-0.0.3/indexify/repository.py ADDED
@@ -0,0 +1,161 @@
+ import aiohttp
+ import requests
+
+ from .index import Index
+ from .data_containers import TextChunk
+ from .settings import DEFAULT_SERVICE_URL
+ from .utils import _get_payload, wait_until
+
+
+ def create_repository(name: str, extractors: list = (), metadata: dict = {},
+                       service_url: str = DEFAULT_SERVICE_URL) -> dict:
+     req = {"name": name, "extractors": extractors, "metadata": metadata}
+     response = requests.post(f"{service_url}/repositories", json=req)
+     response.raise_for_status()
+     return response.json()
+
+
+ def list_repositories(service_url: str = DEFAULT_SERVICE_URL) -> list[dict]:
+     response = requests.get(f"{service_url}/repositories")
+     response.raise_for_status()
+     return response.json()['repositories']
+
+
+ # TODO: consider tying this back to IndexifyExtractor
+ class ExtractorBinding:
+
+     def __init__(self, extractor_name: str, index_name: str, filters: dict, input_params: dict):
+         self.extractor_name = extractor_name
+         self.index_name = index_name
+         self.filters = filters
+         self.input_params = input_params
+
+     def __repr__(self) -> str:
+         return f"ExtractorBinding(extractor_name={self.extractor_name}, index_name={self.index_name})"
+
+     def __str__(self) -> str:
+         return self.__repr__()
+
+
+ class ARepository:
+
+     def __init__(self, name: str, service_url: str):
+         self.name = name
+         self._service_url = service_url
+         self.url = f"{self._service_url}/repositories/{self.name}"
+
+     async def run_extractors(self) -> dict:
+         async with aiohttp.ClientSession() as session:
+             async with session.post(f"{self.url}/run_extractors") as resp:
+                 return await _get_payload(resp)
+
+     async def add_documents(self, *documents: dict) -> None:
+         if isinstance(documents[0], dict):
+             documents = [documents[0]]  # single document passed
+         else:
+             documents = documents[0]  # list of documents passed
+         for doc in documents:
+             if "metadata" not in doc:
+                 doc.update({"metadata": {}})
+         req = {"documents": documents}
+         async with aiohttp.ClientSession() as session:
+             async with session.post(f"{self.url}/add_texts", json=req) as resp:
+                 return await _get_payload(resp)
+
+
+ class Repository(ARepository):
+
+     def __init__(self, name: str = "default", service_url: str = DEFAULT_SERVICE_URL):
+         super().__init__(name, service_url)
+         if not self._name_exists():
+             print(f"creating repo {self.name}")
+             create_repository(name=self.name, service_url=self._service_url)
+
+     def add_documents(self, *documents: dict) -> None:
+         return wait_until(ARepository.add_documents(self, *documents))
+
+     def bind_extractor(self, extractor_name: str, index_name: str,
+                        include: dict | None = None,
+                        exclude: dict | None = None) -> dict:
+         """Bind an extractor to this repository
+
+         Args:
+             extractor_name (str): Name of extractor
+             index_name (str): Name of corresponding index
+             include (dict | None, optional): Conditions that must be true
+                 for an extractor to run on a document in the repository.
+                 Defaults to None.
+             exclude (dict | None, optional): Conditions that must be false
+                 for an extractor to run on a document in the repository.
+                 Defaults to None.
+
+         Returns:
+             dict: response payload
+
+         Examples:
+             >>> repo.bind_extractor("EfficientNet", "png_embeddings",
+                                     include={"file_ext": "png"})
+
+             >>> repo.bind_extractor("MiniLML6", "non_english",
+                                     exclude={"language": "en"})
+
+         """
+         filters = []
+         if include is not None:
+             filters.extend([{'eq': {k: v}} for k, v in include.items()])
+         if exclude is not None:
+             filters.extend([{'ne': {k: v}} for k, v in exclude.items()])
+         req = {"extractor_name": extractor_name,
+                "index_name": index_name,
+                "filters": filters}
+         response = requests.post(f"{self.url}/extractor_bindings", json=req)
+         response.raise_for_status()
+         return response.json()
+
+     @property
+     def extractor_bindings(self) -> list[ExtractorBinding]:
+         return [ExtractorBinding(**e) for e in self._get_repository_info()['extractor_bindings']]
+
+     @property
+     def indexes(self) -> list[Index]:
+         # TODO: implement this - can take from extractors but not correct
+         pass
+
+     # FIXME: query type should depend on index type
+     def query_attribute(self, index_name: str, content_id: str = None) -> dict:
+         # TODO: this should be async
+         params = {"index": index_name}
+         if content_id:
+             params.update({"content_id": content_id})
+         response = requests.get(f"{self.url}/attributes", params=params)
+         response.raise_for_status()
+         return response.json()['attributes']
+
+     def unbind_extractor(self, name) -> dict:
+         # TODO: implement this
+         pass
+
+     def run_extractors(self) -> dict:
+         return wait_until(ARepository.run_extractors(self))
+
+     # TODO: this should move to index
+     def search_index(self, index_name: str, query: str, top_k: int) -> list[TextChunk]:
+         # TODO: this should be async
+         req = {"index": index_name, "query": query, "k": top_k}
+         response = requests.post(f"{self.url}/search", json=req)
+         response.raise_for_status()
+         return response.json()['results']
+
+     def _get_repository_info(self) -> dict:
+         response = requests.get(f"{self.url}")
+         response.raise_for_status()
+         return response.json()['repository']
+
+     def _name_exists(self) -> bool:
+         return self.name in [r['name'] for r in list_repositories(self._service_url)]
+
+     def __repr__(self) -> str:
+         return f"Repository(name={self.name})"
+
+     def __str__(self) -> str:
+         return self.__repr__()
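Putting `Repository` together: a sketch that binds an extractor, ingests documents, runs extraction, and searches the resulting index. The extractor name `MiniLML6` and index name `embeddings` are illustrative, and a live server is assumed:

```python
from indexify import Repository

repo = Repository("default")               # created on the server if missing
repo.bind_extractor("MiniLML6", "embeddings")
repo.add_documents([
    {"text": "Indexify is a retrieval service."},
    {"text": "It runs extractors over documents.", "metadata": {"source": "docs"}},
])
repo.run_extractors()                      # POST /repositories/default/run_extractors
print(repo.search_index("embeddings", "retrieval service", top_k=3))
```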
indexify-0.0.3/indexify/settings.py ADDED
@@ -0,0 +1 @@
+ DEFAULT_SERVICE_URL = "http://localhost:8900"
indexify-0.0.3/indexify/utils.py ADDED
@@ -0,0 +1,40 @@
+ import asyncio
+ from enum import Enum
+ import json
+
+
+ class ApiException(Exception):
+     def __init__(self, message: str) -> None:
+         super().__init__(message)
+
+
+ class Metric(str, Enum):
+     COSINE = "cosine"
+     DOT = "dot"
+     EUCLIDEAN = "euclidean"
+
+     def __str__(self) -> str:
+         return self.name.lower()
+
+
+ async def _get_payload(response):
+     response.raise_for_status()
+     resp = await response.text()
+     return json.loads(resp)
+
+
+ def wait_until(functions):
+     single_result = False
+     if not isinstance(functions, list):
+         single_result = True
+         functions = [functions]
+     holder = []
+
+     async def run_and_capture_result():
+         holder.append(await asyncio.gather(*functions))
+
+     asyncio.run(run_and_capture_result())
+     if single_result:
+         return holder[0][0]  # single result
+     else:
+         return holder[0]  # list of results
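`wait_until` is the sync-over-async bridge used by `Index`, `Memory`, and `Repository`: it drives one coroutine (or a list of them) to completion on a fresh event loop via `asyncio.run`, so it must not be called from inside an already-running loop. A self-contained sketch:

```python
import asyncio

from indexify import wait_until

async def double(x: int) -> int:
    await asyncio.sleep(0)
    return 2 * x

print(wait_until(double(21)))               # 42 (single coroutine -> single result)
print(wait_until([double(1), double(2)]))   # [2, 4] (list -> list of results)
```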
indexify-0.0.1/pyproject.toml → indexify-0.0.3/pyproject.toml
@@ -1,16 +1,16 @@
  [tool.poetry]
  name = "indexify"
- version = "0.0.1"
+ version = "0.0.3"
  description = "Python Client for Indexify"
- authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>"]
+ authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
  license = "Apache 2.0"
- readme = "README.rst"
+ readme = "README.md"
  homepage = "https://github.com/diptanu/indexify"
  repository = "https://github.com/diptanu/indexify"

  [tool.poetry.dependencies]
  python = "^3.10.0"
- requests = "^2.28.2, !=2.30.0"
+ aiohttp = "^3.4"

  [tool.poetry.dev-dependencies]
  black = "^22.3.0"
@@ -22,4 +22,4 @@ pytest-watch = "^4.2.0"

  [build-system]
  requires = ["poetry>=1.2"]
- build-backend = "poetry.masonry.api"
+ build-backend = "poetry.core.masonry.api"
indexify-0.0.1/PKG-INFO DELETED
@@ -1,18 +0,0 @@
- Metadata-Version: 2.1
- Name: indexify
- Version: 0.0.1
- Summary: Python Client for Indexify
- Home-page: https://github.com/diptanu/indexify
- License: Apache 2.0
- Author: Diptanu Gon Choudhury
- Author-email: diptanuc@gmail.com
- Requires-Python: >=3.10.0,<4.0.0
- Classifier: License :: Other/Proprietary License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Requires-Dist: requests (>=2.28.2,<3.0.0,!=2.30.0)
- Project-URL: Repository, https://github.com/diptanu/indexify
- Description-Content-Type: text/x-rst
-
- # Indexify Python Client
indexify-0.0.1/README.rst DELETED
@@ -1 +0,0 @@
- # Indexify Python Client
@@ -1,3 +0,0 @@
- """
- Python Client for Indexify
- """
@@ -1,115 +0,0 @@
- from typing import Optional, List
- from enum import Enum
- import requests
- import json
- import dataclasses
- from dataclasses import dataclass
-
- DEFAULT_INDEXIFY_URL = "https://localhost:8090"
-
- DEFAULT_EMBEDDING_MODEL = "all-minilm-l6-v2"
-
-
- class ApiException(Exception):
-     def __init__(self, message: str) -> None:
-         super().__init__(message)
-
-
- class Metric(str, Enum):
-     COSINE = "cosine"
-     DOT = "dot"
-     EUCLIDEAN = "euclidean"
-
-     def __str__(self) -> str:
-         return self.name.lower()
-
-
- class TextSplitter(str, Enum):
-     NEWLINE = "new_line"
-     REGEX = "regex"
-     NOOP = "noop"
-
-     def __str__(self) -> str:
-         return self.value.lower()
-
-
- @dataclass
- class TextChunk:
-     text: str
-     metadata: dict
-
-     def to_json(self):
-         return json.dumps({"text": self.text, "metadata": self.metadata})
-
-
- @dataclass
- class SearchChunk:
-     index: str
-     query: str
-     k: int
-
-
- @dataclass
- class SearchResult:
-     results: List[TextChunk]
-
-
- class Indexify:
-     def __init__(self, url, index) -> None:
-         self._url = url
-         self._index = index
-
-     @classmethod
-     def create_index(
-         cls,
-         name: str,
-         indexify_url: Optional[str] = DEFAULT_INDEXIFY_URL,
-         embedding_model: Optional[str] = DEFAULT_EMBEDDING_MODEL,
-         metric: Metric = Metric.COSINE,
-         splitter: Optional[str] = TextSplitter.NEWLINE,
-         unique_labels=Optional[List[str]],
-     ):
-         req = {
-             "name": name,
-             "embedding_model": embedding_model,
-             "metric": metric,
-             "text_splitter": splitter,
-             "hash_on": unique_labels,
-         }
-         resp = requests.post(f"{indexify_url}/index/create", json=req)
-         if resp.status_code == 200:
-             return cls(indexify_url, name)
-         Indexify._get_payload(resp)
-
-     @classmethod
-     def get_index(cls, name: str, indexify_url: Optional[str]):
-         return cls(indexify_url, name)
-
-     def add_text_chunk(self, chunk: str, metadata: dict):
-         text_chunk = TextChunk(chunk, metadata)
-         req = {"index": self._index, "documents": [dataclasses.asdict(text_chunk)]}
-         resp = requests.post(f"{self._url}/index/add", json=req)
-         if resp.status_code == 200:
-             return
-         self._get_payload(resp)
-
-     def search(self, query: str, top_k: int):
-         req = SearchChunk(index=self._index, query=query, k=top_k)
-         resp = requests.get(f"{self._url}/index/search", json=dataclasses.asdict(req))
-         payload = self._get_payload(resp)
-         result = SearchResult(results=[])
-         for res in payload["results"]:
-             result.results.append(TextChunk(text=res["text"], metadata=res["metadata"]))
-         return result
-
-     @staticmethod
-     def _get_payload(response):
-         payload = {"errors": []}
-         try:
-             payload = json.loads(response.text)
-         except:
-             raise ApiException(response.text)
-         if len(payload["errors"]) > 0:
-             raise ApiException(f"Failed to create index: {payload['errors']}")
-
-         return payload