indexify-0.0.14-py3-none-any.whl → indexify-0.0.16-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- indexify/client.py +75 -18
- {indexify-0.0.14.dist-info → indexify-0.0.16.dist-info}/METADATA +1 -1
- {indexify-0.0.14.dist-info → indexify-0.0.16.dist-info}/RECORD +5 -5
- {indexify-0.0.14.dist-info → indexify-0.0.16.dist-info}/LICENSE.txt +0 -0
- {indexify-0.0.14.dist-info → indexify-0.0.16.dist-info}/WHEEL +0 -0
indexify/client.py
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
+
import yaml
|
1
2
|
import httpx
|
3
|
+
import uuid
|
4
|
+
import hashlib
|
2
5
|
import json
|
3
6
|
from collections import namedtuple
|
4
7
|
from .settings import DEFAULT_SERVICE_URL
|
@@ -12,7 +15,7 @@ from dataclasses import dataclass
|
|
12
15
|
|
13
16
|
from typing import List, Optional, Union, Dict
|
14
17
|
|
15
|
-
Document = namedtuple("Document", ["text", "labels"])
|
18
|
+
Document = namedtuple("Document", ["text", "labels", "id"])
|
16
19
|
|
17
20
|
SQLQueryRow = namedtuple("SQLQueryRow", ["content_id", "data"])
|
18
21
|
|
@@ -44,14 +47,30 @@ class IndexifyClient:
|
|
44
47
|
self,
|
45
48
|
service_url: str = DEFAULT_SERVICE_URL,
|
46
49
|
namespace: str = "default",
|
50
|
+
config_path: Optional[str] = None,
|
47
51
|
*args,
|
48
52
|
**kwargs,
|
49
53
|
):
|
54
|
+
if config_path:
|
55
|
+
with open(config_path, 'r') as file:
|
56
|
+
config = yaml.safe_load(file)
|
57
|
+
|
58
|
+
if config.get('use_tls', False):
|
59
|
+
tls_config = config['tls_config']
|
60
|
+
self._client = httpx.Client(
|
61
|
+
http2=True,
|
62
|
+
cert=(tls_config['cert_path'], tls_config['key_path']),
|
63
|
+
verify=tls_config.get('ca_bundle_path', True)
|
64
|
+
)
|
65
|
+
else:
|
66
|
+
self._client = httpx.Client(*args, **kwargs)
|
67
|
+
else:
|
68
|
+
self._client = httpx.Client(*args, **kwargs)
|
69
|
+
|
50
70
|
self.namespace: str = namespace
|
51
71
|
self.extraction_policies: List[ExtractionPolicy] = []
|
52
72
|
self.labels: dict = {}
|
53
73
|
self._service_url = service_url
|
54
|
-
self._client = httpx.Client(*args, **kwargs)
|
55
74
|
|
56
75
|
# get namespace data
|
57
76
|
response = self.get(f"namespaces/{self.namespace}")
|
@@ -349,11 +368,21 @@ class IndexifyClient:
|
|
349
368
|
except httpx.HTTPStatusError as exc:
|
350
369
|
raise ApiException(exc.response.text)
|
351
370
|
return
|
371
|
+
|
372
|
+
def get_content_metadata(self, content_id: str) -> dict:
|
373
|
+
"""
|
374
|
+
Get metadata for a specific content ID in a given index.
|
352
375
|
|
353
|
-
|
376
|
+
Args:
|
377
|
+
- content_id (str): content id to query
|
378
|
+
"""
|
379
|
+
response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
|
380
|
+
response.raise_for_status()
|
381
|
+
return response.json()
|
382
|
+
|
383
|
+
def get_extracted_content(
|
354
384
|
self,
|
355
|
-
|
356
|
-
labels_eq: str = None,
|
385
|
+
content_id: str = None,
|
357
386
|
):
|
358
387
|
"""
|
359
388
|
Get list of content from current namespace.
|
@@ -362,11 +391,7 @@ class IndexifyClient:
|
|
362
391
|
- parent_id (str): Optional filter for parent id
|
363
392
|
- labels_eq (str): Optional filter for labels
|
364
393
|
"""
|
365
|
-
params = {}
|
366
|
-
if parent_id:
|
367
|
-
params.update({"parent_id": parent_id})
|
368
|
-
if labels_eq:
|
369
|
-
params.update({"labels_eq": labels_eq})
|
394
|
+
params = {"parent_id": content_id}
|
370
395
|
|
371
396
|
response = self.get(f"namespaces/{self.namespace}/content", params=params)
|
372
397
|
response.raise_for_status()
|
@@ -390,7 +415,7 @@ class IndexifyClient:
|
|
390
415
|
raise ApiException(exc.response.text)
|
391
416
|
|
392
417
|
def add_documents(
|
393
|
-
self, documents: Union[Document, str, List[Union[Document, str]]]
|
418
|
+
self, documents: Union[Document, str, List[Union[Document, str]]], doc_id=None
|
394
419
|
) -> None:
|
395
420
|
"""
|
396
421
|
Add documents to current namespace.
|
@@ -401,14 +426,14 @@ class IndexifyClient:
|
|
401
426
|
if isinstance(documents, Document):
|
402
427
|
documents = [documents]
|
403
428
|
elif isinstance(documents, str):
|
404
|
-
documents = [Document(documents, {})]
|
429
|
+
documents = [Document(documents, {}, id=doc_id)]
|
405
430
|
elif isinstance(documents, list):
|
406
431
|
new_documents = []
|
407
432
|
for item in documents:
|
408
433
|
if isinstance(item, Document):
|
409
434
|
new_documents.append(item)
|
410
435
|
elif isinstance(item, str):
|
411
|
-
new_documents.append(Document(item, {}))
|
436
|
+
new_documents.append(Document(item, {}, id=None)) # don't pass in id for a string content because doesn't make sense to have same content id for all strings
|
412
437
|
else:
|
413
438
|
raise ValueError(
|
414
439
|
"List items must be either Document instances or strings."
|
@@ -419,7 +444,7 @@ class IndexifyClient:
|
|
419
444
|
"Invalid type for documents. Expected Document, str, or list of these."
|
420
445
|
)
|
421
446
|
|
422
|
-
req = {"documents": documents}
|
447
|
+
req = {"documents": [doc._asdict() for doc in documents]}
|
423
448
|
response = self.post(
|
424
449
|
f"namespaces/{self.namespace}/add_texts",
|
425
450
|
json=req,
|
@@ -453,7 +478,7 @@ class IndexifyClient:
|
|
453
478
|
response = self.put(f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}, timeout=None)
|
454
479
|
response.raise_for_status()
|
455
480
|
|
456
|
-
def
|
481
|
+
def get_structured_data(self, content_id: str) -> dict:
|
457
482
|
"""
|
458
483
|
Query metadata for a specific content ID in a given index.
|
459
484
|
|
@@ -464,7 +489,7 @@ class IndexifyClient:
|
|
464
489
|
response.raise_for_status()
|
465
490
|
return response.json().get("metadata",[])
|
466
491
|
|
467
|
-
def search_index(self, name: str, query: str, top_k: int) -> list[TextChunk]:
|
492
|
+
def search_index(self, name: str, query: str, top_k: int, filters: List[str] = None) -> list[TextChunk]:
|
468
493
|
"""
|
469
494
|
Search index in the current namespace.
|
470
495
|
|
@@ -472,8 +497,11 @@ class IndexifyClient:
|
|
472
497
|
- name (str): name of index to search
|
473
498
|
- query (str): query string
|
474
499
|
- top_k (int): top k nearest neighbors to be returned
|
500
|
+
- filters (List[str]): list of filters to apply
|
475
501
|
"""
|
476
|
-
|
502
|
+
if filters is None:
|
503
|
+
filters = []
|
504
|
+
req = {"index": name, "query": query, "k": top_k, "filters": filters}
|
477
505
|
response = self.post(
|
478
506
|
f"namespaces/{self.namespace}/search",
|
479
507
|
json=req,
|
@@ -482,17 +510,23 @@ class IndexifyClient:
|
|
482
510
|
response.raise_for_status()
|
483
511
|
return response.json()["results"]
|
484
512
|
|
485
|
-
def upload_file(self, path: str):
|
513
|
+
def upload_file(self, path: str, id=None, labels: dict = {}):
|
486
514
|
"""
|
487
515
|
Upload a file.
|
488
516
|
|
489
517
|
Args:
|
490
518
|
- path (str): relative path to the file to be uploaded
|
519
|
+
- labels (dict): labels to be associated with the file
|
491
520
|
"""
|
521
|
+
params={}
|
522
|
+
if id is not None:
|
523
|
+
params['id'] = id
|
492
524
|
with open(path, "rb") as f:
|
493
525
|
response = self.post(
|
494
526
|
f"namespaces/{self.namespace}/upload_file",
|
495
527
|
files={"file": f},
|
528
|
+
data=labels,
|
529
|
+
params=params,
|
496
530
|
timeout=None,
|
497
531
|
)
|
498
532
|
response.raise_for_status()
|
@@ -535,4 +569,27 @@ class IndexifyClient:
|
|
535
569
|
)
|
536
570
|
response.raise_for_status()
|
537
571
|
return response.json()
|
572
|
+
|
573
|
+
def generate_unique_hex_id(self):
|
574
|
+
"""
|
575
|
+
Generate a unique hexadecimal identifier
|
576
|
+
|
577
|
+
Returns:
|
578
|
+
str: a unique hexadecimal string
|
579
|
+
"""
|
580
|
+
return uuid.uuid4().hex[:16]
|
581
|
+
|
582
|
+
def generate_hash_from_string(self, input_string: str):
|
583
|
+
"""
|
584
|
+
Generate a hash for the given string and return it as a hexadecimal string.
|
585
|
+
|
586
|
+
Args:
|
587
|
+
input_string (str): The input string to hash.
|
588
|
+
|
589
|
+
Returns:
|
590
|
+
str: The hexadecimal hash of the input string.
|
591
|
+
"""
|
592
|
+
hash_object = hashlib.sha256(input_string.encode())
|
593
|
+
return hash_object.hexdigest()[:16]
|
594
|
+
|
538
595
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
indexify/__init__.py,sha256=Sz6zkAIHsPOi0rG5RM7dVkXGDa0fO2uurD6vS4Qo15E,312
|
2
|
-
indexify/client.py,sha256=
|
2
|
+
indexify/client.py,sha256=ZDirw1O46nRx0WBgB95jvpkd4LdAjgZnlQ_2A673_cI,19047
|
3
3
|
indexify/data_containers.py,sha256=r1wxJPtsmXbyKvb17fqxm-dPjKz51oZ62f8A8Zxls1c,361
|
4
4
|
indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
|
5
5
|
indexify/extraction_policy.py,sha256=vKVHT8jSjzhUaKqWpewOGkYojMBplvGdSm9zoSN9Pcg,750
|
@@ -7,7 +7,7 @@ indexify/extractor.py,sha256=KMcP9xopHJRBzeSxalztGGTBvOzVKRFEsJynV-hLRSc,1175
|
|
7
7
|
indexify/index.py,sha256=RvxYhJXEth-GKvqzlMiz5PuN1eIbZk84pt20piA1Gsw,504
|
8
8
|
indexify/settings.py,sha256=yzWAEZkrTjykSMj3hrFU7l_jUoUCOUsgPVW1nU-qzJQ,46
|
9
9
|
indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
|
10
|
-
indexify-0.0.
|
11
|
-
indexify-0.0.
|
12
|
-
indexify-0.0.
|
13
|
-
indexify-0.0.
|
10
|
+
indexify-0.0.16.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
11
|
+
indexify-0.0.16.dist-info/METADATA,sha256=h_tYmLlbYT0g_9SJnec9hgey8AYP0VTKascytOt0_jE,1714
|
12
|
+
indexify-0.0.16.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
13
|
+
indexify-0.0.16.dist-info/RECORD,,
|
File without changes
|
File without changes
|