indexify 0.0.19-py3-none-any.whl → 0.0.21-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
indexify/__init__.py CHANGED
@@ -1,6 +1,6 @@
 from .index import Index
 from .client import IndexifyClient
-from .extraction_policy import ExtractionPolicy
+from .extraction_policy import ExtractionPolicy, ExtractionGraphBuilder, ExtractionGraph
 from .client import IndexifyClient, Document
 from .settings import DEFAULT_SERVICE_URL
 
@@ -8,6 +8,7 @@ __all__ = [
     "Index",
     "Document",
     "IndexifyClient",
-    "ExtractionPolicy",
+    "ExtractionGraph",
+    "ExtractionGraphBuilder" "ExtractionPolicy",
     "DEFAULT_SERVICE_URL",
 ]
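Note that the last two added literals are adjacent strings (the comma after `"ExtractionGraphBuilder"` is missing), so Python concatenates them into a single `__all__` entry, `"ExtractionGraphBuilderExtractionPolicy"`; the classes themselves remain importable by name. A minimal sketch of the imports 0.0.21 exposes:

```python
# New public names in indexify 0.0.21, alongside the existing ones.
from indexify import (
    ExtractionGraph,
    ExtractionGraphBuilder,
    ExtractionPolicy,
    IndexifyClient,
)
```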
indexify/client.py CHANGED
@@ -6,7 +6,7 @@ import json
 from collections import namedtuple
 from .settings import DEFAULT_SERVICE_URL
 from .extractor import Extractor
-from .extraction_policy import ExtractionPolicy
+from .extraction_policy import ExtractionPolicy, ExtractionGraph
 from .index import Index
 from .utils import json_set_default
 from .data_containers import TextChunk
@@ -19,6 +19,7 @@ Document = namedtuple("Document", ["text", "labels", "id"])
 
 SQLQueryRow = namedtuple("SQLQueryRow", ["content_id", "data"])
 
+
 @dataclass
 class SqlQueryResult:
     result: List[Dict]
@@ -45,22 +46,22 @@ class IndexifyClient:
 
     def __init__(
         self,
-        service_url: str = DEFAULT_SERVICE_URL, # switch this to DEFAULT_SERVICE_URL_HTTPS for TLS
+        service_url: str = DEFAULT_SERVICE_URL,  # switch this to DEFAULT_SERVICE_URL_HTTPS for TLS
         namespace: str = "default",
         config_path: Optional[str] = None,
         *args,
         **kwargs,
     ):
         if config_path:
-            with open(config_path, 'r') as file:
+            with open(config_path, "r") as file:
                 config = yaml.safe_load(file)
-
-            if config.get('use_tls', False):
-                tls_config = config['tls_config']
+
+            if config.get("use_tls", False):
+                tls_config = config["tls_config"]
                 self._client = httpx.Client(
                     http2=True,
-                    cert=(tls_config['cert_path'], tls_config['key_path']),
-                    verify=tls_config.get('ca_bundle_path', True)
+                    cert=(tls_config["cert_path"], tls_config["key_path"]),
+                    verify=tls_config.get("ca_bundle_path", True),
                 )
             else:
                 self._client = httpx.Client(*args, **kwargs)
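When `config_path` is given, the constructor now loads a YAML file and, if `use_tls` is set, builds an HTTP/2 client with a client certificate. A hedged sketch of a config matching the keys read above; all paths and the URL are placeholders:

```python
import textwrap

from indexify import IndexifyClient

# Hypothetical config file; the key names (use_tls, tls_config.cert_path,
# tls_config.key_path, tls_config.ca_bundle_path) come from the hunk above.
with open("indexify.yaml", "w") as f:
    f.write(textwrap.dedent("""\
        use_tls: true
        tls_config:
          cert_path: client.crt      # client certificate
          key_path: client.key       # client private key
          ca_bundle_path: ca.crt     # optional; omitted -> default verification
        """))

client = IndexifyClient(
    service_url="https://localhost:8900",  # assumes a TLS-enabled server
    config_path="indexify.yaml",
)
```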
@@ -68,17 +69,18 @@ class IndexifyClient:
             self._client = httpx.Client(*args, **kwargs)
 
         self.namespace: str = namespace
-        self.extraction_policies: List[ExtractionPolicy] = []
+        self.extraction_graphs: List[ExtractionGraph] = []
         self.labels: dict = {}
         self._service_url = service_url
+        self._timeout = kwargs.get("timeout")
 
         # get namespace data
         response = self.get(f"namespaces/{self.namespace}")
         response.raise_for_status()
         resp_json = response.json()
         # initialize extraction_policies
-        for eb in resp_json["namespace"]["extraction_policies"]:
-            self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
+        for eb in resp_json["namespace"]["extraction_graphs"]:
+            self.extraction_graphs.append(ExtractionGraph.from_dict(eb))
 
     @classmethod
     def with_mtls(
@@ -128,7 +130,7 @@ class IndexifyClient:
         return client
 
     def _request(self, method: str, **kwargs) -> httpx.Response:
-        response = self._client.request(method,timeout=None, **kwargs)
+        response = self._client.request(method, timeout=self._timeout, **kwargs)
         try:
             response.raise_for_status()
         except httpx.HTTPStatusError as exc:
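`_request` previously hard-coded `timeout=None` (no timeout at all); it now reuses whatever `timeout` was passed to the constructor, which `__init__` captures via `kwargs.get("timeout")`. A sketch of the resulting behavior, assuming a server on localhost:

```python
from indexify import IndexifyClient

# The timeout kwarg reaches httpx.Client(*args, **kwargs) and is also
# replayed on every request via self._timeout.
client = IndexifyClient(timeout=30.0)

# With no explicit timeout, kwargs.get("timeout") yields None, which httpx
# treats as "never time out" rather than its usual 5-second default.
client_no_timeout = IndexifyClient()
```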
@@ -188,7 +190,7 @@ class IndexifyClient:
         ```
         """
         return self._request("PUT", url=f"{self._service_url}/{endpoint}", **kwargs)
-
+
     def delete(self, endpoint: str, **kwargs) -> httpx.Response:
         """
         Make a DELETE request to the Indexify service.
@@ -243,9 +245,9 @@ class IndexifyClient:
     def create_namespace(
         self,
         namespace: str,
-        extraction_policies: list = [],
+        extraction_graphs: list = [],
         labels: dict = {},
-        service_url: str = DEFAULT_SERVICE_URL
+        service_url: str = DEFAULT_SERVICE_URL,
     ) -> "IndexifyClient":
         """
         Create a new namespace.
@@ -253,16 +255,16 @@ class IndexifyClient:
         Returns:
             IndexifyClient: a new client with the given namespace
         """
-        extraction_policies = []
-        for bd in extraction_policies:
-            if isinstance(bd, ExtractionPolicy):
-                extraction_policies.append(bd.to_dict())
+        extraction_graphs = []
+        for bd in extraction_graphs:
+            if isinstance(bd, extraction_graphs):
+                extraction_graphs.append(bd.to_dict())
             else:
-                extraction_policies.append(bd)
+                extraction_graphs.append(bd)
 
         req = {
             "name": namespace,
-            "extraction_policies": extraction_policies,
+            "extraction_graphs": extraction_graphs,
             "labels": labels,
         }
 
@@ -318,57 +320,24 @@ class IndexifyClient:
             self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
         return self.extraction_policies
 
-    def add_extraction_policy(
-        self,
-        extractor: str,
-        name: str,
-        input_params: dict = {},
-        labels_eq: str = None,
-        content_source="ingestion",
-    ) -> dict:
-        """Add a new extraction policy.
+    def create_extraction_graph(self, extraction_graph: ExtractionGraph):
+        """
+        Create a new extraction graph.
 
         Args:
-        - extractor (str): Name of the extractor
-        - name (str): Name for this instance
-        - input_params (dict): Dictionary containing extractor input params
-        - filter (Filter): Optional filter for this extractor
-
-        Returns:
-        dict: response payload
-
-        Examples:
-        >>> repo.add_extraction_policy("EfficientNet", "efficientnet")
-
-        >>> repo.add_extraction_policy("MiniLML6", "minilm")
-
+        - extraction_graph (ExtractionGraph): the extraction graph to create
         """
-        req = {
-            "extractor": extractor,
-            "name": name,
-            "input_params": input_params,
-            "filters_eq": labels_eq,
-            "content_source": content_source,
-        }
-        if req["filters_eq"] == None:
-            del req["filters_eq"]
-
+        req = extraction_graph.to_dict()
+        req["namespace"] = self.namespace
         request_body = json.dumps(req, default=json_set_default)
         response = self.post(
-            f"namespaces/{self.namespace}/extraction_policies",
+            f"namespaces/{self.namespace}/extraction_graphs",
             data=request_body,
             headers={"Content-Type": "application/json"},
         )
-
-        # update self.extractor_bindings
-        self.get_extraction_policies()
-
-        try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
+        response.raise_for_status()
         return
-
+
     def get_content_metadata(self, content_id: str) -> dict:
         """
         Get metadata for a specific content ID in a given index.
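The per-policy `add_extraction_policy` API is gone; 0.0.21 submits a whole `ExtractionGraph` to the new `extraction_graphs` endpoint instead. A hedged usage sketch; the graph name and extractor reference are placeholders:

```python
from indexify import ExtractionGraph, IndexifyClient

# Hypothetical graph spec; the field names match ExtractionGraph.from_yaml /
# from_dict in extraction_policy.py below.
graph_spec = """
name: summarizer
extraction_policies:
  - extractor: example/some-extractor
    name: policy-1
"""

client = IndexifyClient()
graph = ExtractionGraph.from_yaml(graph_spec)
client.create_extraction_graph(graph)
```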
@@ -399,11 +368,11 @@ class IndexifyClient:
             self._add_content_url(content)
             for content in response.json()["content_list"]
         ]
-
-    def download_content(self, id:str) -> bytes:
+
+    def download_content(self, id: str) -> bytes:
         """
         Download content from id. Return bytes
-
+
         Args:
         - id (str): id of content to download
         """
@@ -415,7 +384,10 @@ class IndexifyClient:
             raise ApiException(exc.response.text)
 
     def add_documents(
-        self, documents: Union[Document, str, List[Union[Document, str]]], doc_id=None
+        self,
+        extraction_graphs: Union[str, List[str]],
+        documents: Union[Document, str, List[Union[Document, str]]],
+        doc_id=None,
     ) -> None:
         """
         Add documents to current namespace.
@@ -423,6 +395,8 @@ class IndexifyClient:
         Args:
         - documents (Union[Document, str, List[Union[Document, str]]]): this can be a list of strings, list of Documents or a mix of both
         """
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
         if isinstance(documents, Document):
             documents = [documents]
         elif isinstance(documents, str):
@@ -433,7 +407,9 @@ class IndexifyClient:
                 if isinstance(item, Document):
                     new_documents.append(item)
                 elif isinstance(item, str):
-                    new_documents.append(Document(item, {}, id=None)) # don't pass in id for a string content because doesn't make sense to have same content id for all strings
+                    new_documents.append(
+                        Document(item, {}, id=None)
+                    )  # don't pass in id for a string content because doesn't make sense to have same content id for all strings
                 else:
                     raise ValueError(
                         "List items must be either Document instances or strings."
@@ -444,7 +420,10 @@ class IndexifyClient:
                 "Invalid type for documents. Expected Document, str, or list of these."
             )
 
-        req = {"documents": [doc._asdict() for doc in documents]}
+        req = {
+            "documents": [doc._asdict() for doc in documents],
+            "extraction_graph_names": extraction_graphs,
+        }
         response = self.post(
             f"namespaces/{self.namespace}/add_texts",
             json=req,
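Ingestion is now tied to named graphs: `add_documents` takes the graph name(s) as its first argument and sends them as `extraction_graph_names`, so 0.0.19-style calls need updating. A sketch with a placeholder graph name:

```python
from indexify import Document, IndexifyClient

client = IndexifyClient()

# A bare string or a list of strings/Documents is accepted, as before.
client.add_documents(
    "summarizer",
    ["first document", Document("second document", {"lang": "en"}, id=None)],
)
```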
@@ -475,7 +454,9 @@ class IndexifyClient:
         - path (str): relative path to the file to be uploaded
         """
         with open(path, "rb") as f:
-            response = self.put(f"namespaces/{self.namespace}/content/{document_id}", files={"file": f})
+            response = self.put(
+                f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
+            )
         response.raise_for_status()
 
     def get_structured_data(self, content_id: str) -> dict:
@@ -485,11 +466,15 @@ class IndexifyClient:
         Args:
         - content_id (str): content id to query
         """
-        response = self.get(f"namespaces/{self.namespace}/content/{content_id}/metadata")
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/metadata"
+        )
         response.raise_for_status()
-        return response.json().get("metadata",[])
+        return response.json().get("metadata", [])
 
-    def search_index(self, name: str, query: str, top_k: int, filters: List[str] = []) -> list[TextChunk]:
+    def search_index(
+        self, name: str, query: str, top_k: int, filters: List[str] = []
+    ) -> list[TextChunk]:
         """
         Search index in the current namespace.
 
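`search_index` is only reformatted here, not changed in behavior. For reference, a sketch of a call against a placeholder index name:

```python
from indexify import IndexifyClient

client = IndexifyClient()

# Returns the "results" list from the search response.
chunks = client.search_index("my-index", "what changed in 0.0.21?", top_k=3)
```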
@@ -508,7 +493,7 @@ class IndexifyClient:
         response.raise_for_status()
         return response.json()["results"]
 
-    def upload_file(self, path: str, id=None, labels: dict = {}) -> str:
+    def upload_file(self, extraction_graphs: Union[str, List[str]], path: str, id=None, labels: dict = {}) -> str:
         """
         Upload a file.
 
@@ -516,9 +501,11 @@ class IndexifyClient:
         - path (str): relative path to the file to be uploaded
         - labels (dict): labels to be associated with the file
         """
-        params={}
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        params = {"extraction_graph_names": extraction_graphs}
        if id is not None:
-            params['id'] = id
+            params["id"] = id
         with open(path, "rb") as f:
             response = self.post(
                 f"namespaces/{self.namespace}/upload_file",
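`upload_file` follows the same pattern: the graph name(s) come first and are always sent as the `extraction_graph_names` query parameter (previously `params` started out empty). A sketch with placeholder arguments:

```python
from indexify import IndexifyClient

client = IndexifyClient()

# Graph name and path are placeholders; returns the content id.
content_id = client.upload_file("summarizer", "docs/report.pdf")
```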
@@ -537,18 +524,20 @@ class IndexifyClient:
         response = self.get(f"namespaces/{self.namespace}/schemas")
         response.raise_for_status()
         return response.json()
-
-    def get_content_tree(self, content_id:str):
+
+    def get_content_tree(self, content_id: str):
         """
         Get content tree for a given content id
 
         Args:
         - content_id (str): id of content
         """
-        response = self.get(f"namespaces/{self.namespace}/content/{content_id}/content-tree")
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/content-tree"
+        )
         response.raise_for_status()
         return response.json()
-
+
     def sql_query(self, query: str):
         """
         Execute a SQL query.
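`sql_query` is unchanged apart from whitespace; per the context lines in the next hunk, it collects each returned row's `data` into a `SqlQueryResult`. A sketch with a placeholder table name:

```python
from indexify import IndexifyClient

client = IndexifyClient()

# result.result holds one dict per returned row.
result = client.sql_query("select * from ingestion;")
for row in result.result:
    print(row)
```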
@@ -569,9 +558,13 @@ class IndexifyClient:
             data = row["data"]
             rows.append(data)
         return SqlQueryResult(result=rows)
-
-    def ingest_remote_file(self, url: str, mime_type: str, labels: Dict[str, str], id=None):
-        req = {"url": url, "mime_type": mime_type, "labels": labels, "id": id}
+
+    def ingest_remote_file(
+        self, extraction_graphs: Union[str, List[str]], url: str, mime_type: str, labels: Dict[str, str], id=None
+    ):
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        req = {"url": url, "mime_type": mime_type, "labels": labels, "id": id, "extraction_graph_names": extraction_graphs}
         response = self.post(
             f"namespaces/{self.namespace}/ingest_remote_file",
             json=req,
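`ingest_remote_file` likewise takes the graph name(s) first and adds `extraction_graph_names` to the request body. A sketch with placeholder values:

```python
from indexify import IndexifyClient

client = IndexifyClient()

client.ingest_remote_file(
    "summarizer",                       # placeholder graph name
    "https://example.com/report.pdf",   # placeholder URL
    "application/pdf",
    {"source": "example"},
)
```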
@@ -579,7 +572,7 @@ class IndexifyClient:
         )
         response.raise_for_status()
         return response.json()
-
+
     def generate_unique_hex_id(self):
         """
         Generate a unique hexadecimal identifier
@@ -588,18 +581,16 @@ class IndexifyClient:
             str: a unique hexadecimal string
         """
         return uuid.uuid4().hex[:16]
-
+
     def generate_hash_from_string(self, input_string: str):
         """
         Generate a hash for the given string and return it as a hexadecimal string.
-
+
         Args:
             input_string (str): The input string to hash.
-
+
         Returns:
             str: The hexadecimal hash of the input string.
         """
         hash_object = hashlib.sha256(input_string.encode())
         return hash_object.hexdigest()[:16]
-
-
indexify/extraction_policy.py CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, asdict
-from typing import Optional
+from typing import Optional, List
 
 
 @dataclass
@@ -7,7 +7,7 @@ class ExtractionPolicy:
     extractor: str
     name: str
     content_source: str
-    input_params: dict
+    input_params: Optional[dict] = None
     id: Optional[str] = None
     labels_eq: Optional[str] = None
 
@@ -25,4 +25,44 @@ class ExtractionPolicy:
     def from_dict(cls, json: dict):
         if "filters_eq" in json:
             json["labels_eq"] = json.pop("filters_eq")
+        json["id"] = json.get("id", None)
         return ExtractionPolicy(**json)
+
+
+@dataclass
+class ExtractionGraph:
+    id: str
+    name: str
+    extraction_policies: List[ExtractionPolicy]
+
+    @classmethod
+    def from_dict(cls, json: dict):
+        json["id"] = json.get("id", None)
+        if "namespace" in json.keys():
+            json.pop("namespace")
+        return ExtractionGraph(**json)
+
+    @staticmethod
+    def from_yaml(spec: str):
+        import yaml
+
+        return ExtractionGraph.from_dict(yaml.load(spec, Loader=yaml.FullLoader))
+
+    def to_dict(self) -> dict:
+        filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
+        return filtered_dict
+
+
+class ExtractionGraphBuilder:
+    def __init__(self, name: str):
+        self.name = name
+        self.extraction_policies = []
+
+    def policy(self, policy: ExtractionPolicy) -> "ExtractionGraphBuilder":
+        self.extraction_policies.append(policy)
+        return self
+
+    def build(self):
+        return ExtractionGraph(
+            id=self.id, name=self.name, extraction_policies=self.extraction_policies
+        )
indexify/extractor.py CHANGED
@@ -17,7 +17,12 @@ class ExtractorSchema:
 
 class Extractor:
     def __init__(
-        self, name: str, description: str, input_params: dict, outputs: ExtractorSchema, input_mime_types: list[str]
+        self,
+        name: str,
+        description: str,
+        input_params: dict,
+        outputs: ExtractorSchema,
+        input_mime_types: list[str],
     ):
         self.name = name
         self.description = description
indexify/settings.py CHANGED
@@ -1,2 +1,2 @@
 DEFAULT_SERVICE_URL = "http://localhost:8900"
-DEFAULT_SERVICE_URL_HTTPS = "https://localhost:8900"
\ No newline at end of file
+DEFAULT_SERVICE_URL_HTTPS = "https://localhost:8900"
{indexify-0.0.19.dist-info → indexify-0.0.21.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: indexify
-Version: 0.0.19
+Version: 0.0.21
 Summary: Python Client for Indexify
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: httpx[http2] (>=0.26,<0.27)
+Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
 Project-URL: Repository, https://github.com/tensorlakeai/indexify
 Description-Content-Type: text/markdown
 
indexify-0.0.21.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+indexify/__init__.py,sha256=hhDqRvJo4gCW1eqVgFblxKiBzArCFfo2eFGOBsQkDOc,401
+indexify/client.py,sha256=s2Xflh75574WvNp0lbG6PGtK2Dy3CMfME5MDK1iDgR4,19334
+indexify/data_containers.py,sha256=r1wxJPtsmXbyKvb17fqxm-dPjKz51oZ62f8A8Zxls1c,361
+indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
+indexify/extraction_policy.py,sha256=dIyQK3N-QOpQ0BPjiZ_635o8A5ITNxaz1syQ_FPaE0k,1851
+indexify/extractor.py,sha256=sWFLlXHgEfWlmiKAXN6ytUt_uG7th-XGNHqz-TG39gs,1216
+indexify/index.py,sha256=RvxYhJXEth-GKvqzlMiz5PuN1eIbZk84pt20piA1Gsw,504
+indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
+indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
+indexify-0.0.21.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+indexify-0.0.21.dist-info/METADATA,sha256=Rb_7fwsIiJKuJaLnmJp7Cw4exYLhHcdx48OfBcFzaO4,1753
+indexify-0.0.21.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+indexify-0.0.21.dist-info/RECORD,,
indexify-0.0.19.dist-info/RECORD DELETED
@@ -1,13 +0,0 @@
-indexify/__init__.py,sha256=Sz6zkAIHsPOi0rG5RM7dVkXGDa0fO2uurD6vS4Qo15E,312
-indexify/client.py,sha256=x2-Yqa59x20K4-5V7Agh35jOGqRIBGZrAoQYKXjuq0A,19480
-indexify/data_containers.py,sha256=r1wxJPtsmXbyKvb17fqxm-dPjKz51oZ62f8A8Zxls1c,361
-indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
-indexify/extraction_policy.py,sha256=vKVHT8jSjzhUaKqWpewOGkYojMBplvGdSm9zoSN9Pcg,750
-indexify/extractor.py,sha256=KMcP9xopHJRBzeSxalztGGTBvOzVKRFEsJynV-hLRSc,1175
-indexify/index.py,sha256=RvxYhJXEth-GKvqzlMiz5PuN1eIbZk84pt20piA1Gsw,504
-indexify/settings.py,sha256=UXUd6hYlDALPPjUCFvFkvUmsm7HwXAluWowCjZWoxjY,98
-indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
-indexify-0.0.19.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-indexify-0.0.19.dist-info/METADATA,sha256=reizFOmSBBTh3n4wMVcxqeOdg7APpnBmpcxr32jiwJg,1714
-indexify-0.0.19.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-indexify-0.0.19.dist-info/RECORD,,