indexify 0.0.20.tar.gz → 0.0.22.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: indexify
- Version: 0.0.20
+ Version: 0.0.22
  Summary: Python Client for Indexify
  Home-page: https://github.com/tensorlakeai/indexify
  License: Apache 2.0
@@ -1,6 +1,6 @@
  from .index import Index
  from .client import IndexifyClient
- from .extraction_policy import ExtractionPolicy
+ from .extraction_policy import ExtractionPolicy, ExtractionGraphBuilder, ExtractionGraph
  from .client import IndexifyClient, Document
  from .settings import DEFAULT_SERVICE_URL

@@ -8,6 +8,7 @@ __all__ = [
  "Index",
  "Document",
  "IndexifyClient",
- "ExtractionPolicy",
+ "ExtractionGraph",
+ "ExtractionGraphBuilder" "ExtractionPolicy",
12
13
  "DEFAULT_SERVICE_URL",
13
14
  ]
@@ -6,19 +6,20 @@ import json
  from collections import namedtuple
  from .settings import DEFAULT_SERVICE_URL
  from .extractor import Extractor
- from .extraction_policy import ExtractionPolicy
+ from .extraction_policy import ExtractionPolicy, ExtractionGraph
  from .index import Index
  from .utils import json_set_default
+ from .error import Error
  from .data_containers import TextChunk
  from indexify.exceptions import ApiException
  from dataclasses import dataclass
-
  from typing import List, Optional, Union, Dict

  Document = namedtuple("Document", ["text", "labels", "id"])

  SQLQueryRow = namedtuple("SQLQueryRow", ["content_id", "data"])

+
  @dataclass
  class SqlQueryResult:
  result: List[Dict]
@@ -45,22 +46,22 @@ class IndexifyClient:

  def __init__(
  self,
- service_url: str = DEFAULT_SERVICE_URL, # switch this to DEFAULT_SERVICE_URL_HTTPS for TLS
+ service_url: str = DEFAULT_SERVICE_URL,  # switch this to DEFAULT_SERVICE_URL_HTTPS for TLS
  namespace: str = "default",
  config_path: Optional[str] = None,
  *args,
  **kwargs,
  ):
  if config_path:
- with open(config_path, 'r') as file:
+ with open(config_path, "r") as file:
  config = yaml.safe_load(file)
-
- if config.get('use_tls', False):
- tls_config = config['tls_config']
+
+ if config.get("use_tls", False):
+ tls_config = config["tls_config"]
  self._client = httpx.Client(
  http2=True,
- cert=(tls_config['cert_path'], tls_config['key_path']),
- verify=tls_config.get('ca_bundle_path', True)
+ cert=(tls_config["cert_path"], tls_config["key_path"]),
+ verify=tls_config.get("ca_bundle_path", True),
  )
  else:
  self._client = httpx.Client(*args, **kwargs)
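
The constructor now reads an optional YAML config file: a use_tls flag plus a tls_config mapping with cert_path, key_path, and an optional ca_bundle_path. A minimal sketch of such a file and of building a client from it; every path below is hypothetical:

    # /etc/indexify/config.yaml (hypothetical):
    #   use_tls: true
    #   tls_config:
    #     cert_path: /etc/indexify/client.crt   # client certificate for mTLS
    #     key_path: /etc/indexify/client.key    # matching private key
    #     ca_bundle_path: /etc/indexify/ca.pem  # optional; verification defaults to True
    from indexify import IndexifyClient

    client = IndexifyClient(
        service_url="https://localhost:8900",
        config_path="/etc/indexify/config.yaml",
    )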
@@ -68,17 +69,13 @@ class IndexifyClient:
  self._client = httpx.Client(*args, **kwargs)

  self.namespace: str = namespace
- self.extraction_policies: List[ExtractionPolicy] = []
+ self.extraction_graphs: List[ExtractionGraph] = []
  self.labels: dict = {}
  self._service_url = service_url
+ self._timeout = kwargs.get("timeout")

  # get namespace data
- response = self.get(f"namespaces/{self.namespace}")
- response.raise_for_status()
- resp_json = response.json()
- # initialize extraction_policies
- for eb in resp_json["namespace"]["extraction_policies"]:
- self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
+ self.extraction_graphs = self.get_extraction_graphs()

  @classmethod
  def with_mtls(
@@ -128,12 +125,18 @@ class IndexifyClient:
  return client

  def _request(self, method: str, **kwargs) -> httpx.Response:
- response = self._client.request(method,timeout=None, **kwargs)
  try:
- response.raise_for_status()
- except httpx.HTTPStatusError as exc:
- print(f"exception: {exc}, response text: {response.text}")
- raise exc
+ response = self._client.request(method, timeout=self._timeout, **kwargs)
+ status_code = str(response.status_code)
+ if status_code.startswith("4") or status_code.startswith("5"):
+ error = Error.from_tonic_error_string(str(response.url), response.text)
+ self.__print_additional_error_context(error)
+ raise error
+ except httpx.ConnectError:
+ message = f"Make sure the server is running and accessible at {self._service_url}"
+ error = Error(status="ConnectionError", message=message)
+ print(error)
+ raise error
  return response

  def get(self, endpoint: str, **kwargs) -> httpx.Response:
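
With this change, 4xx/5xx responses and connection failures are raised as the new Error type instead of httpx.HTTPStatusError. A sketch of how calling code might handle it; the indexify.error module path follows the `from .error import Error` import added above:

    from indexify import IndexifyClient
    from indexify.error import Error

    try:
        client = IndexifyClient()  # raises status "ConnectionError" if the server is down
        client.get("namespaces/default")
    except Error as e:
        print(e.status, e.message)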
@@ -188,7 +191,7 @@ class IndexifyClient:
  ```
  """
  return self._request("PUT", url=f"{self._service_url}/{endpoint}", **kwargs)
-
+
  def delete(self, endpoint: str, **kwargs) -> httpx.Response:
  """
  Make a DELETE request to the Indexify service.
@@ -243,9 +246,9 @@ class IndexifyClient:
  def create_namespace(
  self,
  namespace: str,
- extraction_policies: list = [],
+ extraction_graphs: list = [],
  labels: dict = {},
- service_url: str = DEFAULT_SERVICE_URL
+ service_url: str = DEFAULT_SERVICE_URL,
  ) -> "IndexifyClient":
  """
  Create a new namespace.
@@ -253,16 +256,16 @@ class IndexifyClient:
  Returns:
  IndexifyClient: a new client with the given namespace
  """
- extraction_policies = []
- for bd in extraction_policies:
- if isinstance(bd, ExtractionPolicy):
- extraction_policies.append(bd.to_dict())
+ graphs = []
+ for bd in extraction_graphs:
+ if isinstance(bd, ExtractionGraph):
+ graphs.append(bd.to_dict())
  else:
- extraction_policies.append(bd)
+ graphs.append(bd)

  req = {
  "name": namespace,
- "extraction_policies": extraction_policies,
+ "extraction_graphs": graphs,
  "labels": labels,
  }

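A usage sketch for the updated signature; the namespace name and labels are invented, and create_namespace is called as a constructor-style classmethod as in 0.0.20:

    namespace_client = IndexifyClient.create_namespace(
        namespace="research",
        extraction_graphs=[],          # ExtractionGraph objects or plain dicts
        labels={"owner": "ml-team"},
    )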
@@ -289,7 +292,6 @@ class IndexifyClient:
  List[Index]: list of indexes in the current namespace
  """
  response = self.get(f"namespaces/{self.namespace}/indexes")
- response.raise_for_status()
  return response.json()["indexes"]

  def extractors(self) -> List[Extractor]:
@@ -306,69 +308,36 @@ class IndexifyClient:
  extractors.append(Extractor.from_dict(ed))
  return extractors

- def get_extraction_policies(self):
+ def get_extraction_graphs(self) -> List[ExtractionGraph]:
  """
  Retrieve and update the list of extraction policies for the current namespace.
  """
  response = self.get(f"namespaces/{self.namespace}")
- response.raise_for_status()
+ json = response.json()

- self.extraction_policies = []
- for eb in response.json()["namespace"]["extraction_policies"]:
- self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
- return self.extraction_policies
+ self.extraction_graphs = []
+ for graph in json["namespace"]["extraction_graphs"]:
+ self.extraction_graphs.append(ExtractionGraph.from_dict(graph))

- def add_extraction_policy(
- self,
- extractor: str,
- name: str,
- input_params: dict = {},
- labels_eq: str = None,
- content_source="ingestion",
- ) -> dict:
- """Add a new extraction policy.
-
- Args:
- - extractor (str): Name of the extractor
- - name (str): Name for this instance
- - input_params (dict): Dictionary containing extractor input params
- - filter (Filter): Optional filter for this extractor
-
- Returns:
- dict: response payload
-
- Examples:
- >>> repo.add_extraction_policy("EfficientNet", "efficientnet")
-
- >>> repo.add_extraction_policy("MiniLML6", "minilm")
+ return self.extraction_graphs

+ def create_extraction_graph(self, extraction_graph: ExtractionGraph):
  """
- req = {
- "extractor": extractor,
- "name": name,
- "input_params": input_params,
- "filters_eq": labels_eq,
- "content_source": content_source,
- }
- if req["filters_eq"] == None:
- del req["filters_eq"]
+ Create a new extraction graph.

+ Args:
+ - extraction_graph (ExtractionGraph): the extraction graph to create
+ """
+ req = extraction_graph.to_dict()
+ req["namespace"] = self.namespace
  request_body = json.dumps(req, default=json_set_default)
  response = self.post(
- f"namespaces/{self.namespace}/extraction_policies",
+ f"namespaces/{self.namespace}/extraction_graphs",
  data=request_body,
  headers={"Content-Type": "application/json"},
  )
-
- # update self.extractor_bindings
- self.get_extraction_policies()
-
- try:
- response.raise_for_status()
- except httpx.HTTPStatusError as exc:
- raise ApiException(exc.response.text)
  return
-
+
  def get_content_metadata(self, content_id: str) -> dict:
  """
  Get metadata for a specific content ID in a given index.
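
add_extraction_policy is gone: policies now live inside a named ExtractionGraph that is created in a single call. A sketch under assumed names; the extractor "tensorlake/minilm-l6" and the graph name "summaries" are hypothetical:

    from indexify import ExtractionGraph, ExtractionPolicy, IndexifyClient

    client = IndexifyClient()
    graph = ExtractionGraph(
        id=None,  # server assigns the id
        name="summaries",
        extraction_policies=[
            ExtractionPolicy(
                extractor="tensorlake/minilm-l6",
                name="minilm",
                content_source="ingestion",
            )
        ],
    )
    client.create_extraction_graph(graph)
    print([g.name for g in client.get_extraction_graphs()])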
@@ -377,52 +346,32 @@ class IndexifyClient:
  - content_id (str): content id to query
  """
  response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
- response.raise_for_status()
  return response.json()
-
- def get_extracted_content(
- self,
- content_id: str = None,
- ):
- """
- Get list of content from current namespace.
-
- Args:
- - parent_id (str): Optional filter for parent id
- - labels_eq (str): Optional filter for labels
- """
- params = {"parent_id": content_id}
-
- response = self.get(f"namespaces/{self.namespace}/content", params=params)
- response.raise_for_status()
- return [
- self._add_content_url(content)
- for content in response.json()["content_list"]
- ]
-
- def download_content(self, id:str) -> bytes:
+
+ def download_content(self, id: str) -> bytes:
  """
  Download content from id. Return bytes
-
+
  Args:
  - id (str): id of content to download
  """
  response = self.get(f"namespaces/{self.namespace}/content/{id}/download")
- try:
- response.raise_for_status()
- return response.content
- except httpx.HTTPStatusError as exc:
- raise ApiException(exc.response.text)
+ return response.content

  def add_documents(
- self, documents: Union[Document, str, List[Union[Document, str]]], doc_id=None
- ) -> None:
+ self,
+ extraction_graphs: Union[str, List[str]],
+ documents: Union[Document, str, List[Union[Document, str]]],
+ doc_id=None,
+ ) -> Union[str, List[str]]:
  """
  Add documents to current namespace.

  Args:
  - documents (Union[Document, str, List[Union[Document, str]]]): this can be a list of strings, list of Documents or a mix of both
  """
+ if isinstance(extraction_graphs, str):
+ extraction_graphs = [extraction_graphs]
  if isinstance(documents, Document):
  documents = [documents]
  elif isinstance(documents, str):
@@ -433,7 +382,9 @@ class IndexifyClient:
  if isinstance(item, Document):
  new_documents.append(item)
  elif isinstance(item, str):
- new_documents.append(Document(item, {}, id=None)) # don't pass in id for a string content because doesn't make sense to have same content id for all strings
+ new_documents.append(
+ Document(item, {}, id=None)
+ ) # don't pass in id for a string content because doesn't make sense to have same content id for all strings
  else:
  raise ValueError(
  "List items must be either Document instances or strings."
@@ -444,13 +395,21 @@ class IndexifyClient:
  "Invalid type for documents. Expected Document, str, or list of these."
  )

- req = {"documents": [doc._asdict() for doc in documents]}
+ req = {
+ "documents": [doc._asdict() for doc in documents],
+ "extraction_graph_names": extraction_graphs,
+ }
  response = self.post(
  f"namespaces/{self.namespace}/add_texts",
  json=req,
  headers={"Content-Type": "application/json"},
  )
  response.raise_for_status()
+ response_json = response.json()
+ content_ids = response_json["content_ids"]
+ if len(documents) == 1 and len(content_ids) == 1:
+ return content_ids[0]
+ return content_ids

  def delete_documents(self, document_ids: List[str]) -> None:
  """
@@ -465,7 +424,6 @@ class IndexifyClient:
  json=req,
  headers={"Content-Type": "application/json"},
  )
- response.raise_for_status()

  def update_content(self, document_id: str, path: str) -> None:
  """
@@ -475,8 +433,9 @@ class IndexifyClient:
  - path (str): relative path to the file to be uploaded
  """
  with open(path, "rb") as f:
- response = self.put(f"namespaces/{self.namespace}/content/{document_id}", files={"file": f})
- response.raise_for_status()
+ response = self.put(
+ f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
+ )

  def get_structured_data(self, content_id: str) -> dict:
  """
@@ -485,11 +444,14 @@ class IndexifyClient:
  Args:
  - content_id (str): content id to query
  """
- response = self.get(f"namespaces/{self.namespace}/content/{content_id}/metadata")
- response.raise_for_status()
- return response.json().get("metadata",[])
+ response = self.get(
+ f"namespaces/{self.namespace}/content/{content_id}/metadata"
+ )
+ return response.json().get("metadata", [])

- def search_index(self, name: str, query: str, top_k: int, filters: List[str] = []) -> list[TextChunk]:
+ def search_index(
+ self, name: str, query: str, top_k: int, filters: List[str] = []
+ ) -> list[TextChunk]:
  """
  Search index in the current namespace.
@@ -505,10 +467,9 @@ class IndexifyClient:
  json=req,
  headers={"Content-Type": "application/json"},
  )
- response.raise_for_status()
  return response.json()["results"]

- def upload_file(self, path: str, id=None, labels: dict = {}) -> str:
+ def upload_file(self, extraction_graphs: Union[str, List[str]], path: str, id=None, labels: dict = {}) -> str:
  """
  Upload a file.

@@ -516,9 +477,11 @@ class IndexifyClient:
  - path (str): relative path to the file to be uploaded
  - labels (dict): labels to be associated with the file
  """
- params={}
+ if isinstance(extraction_graphs, str):
+ extraction_graphs = [extraction_graphs]
+ params = {"extraction_graph_names": extraction_graphs}
  if id is not None:
- params['id'] = id
+ params["id"] = id
  with open(path, "rb") as f:
  response = self.post(
  f"namespaces/{self.namespace}/upload_file",
@@ -526,7 +489,6 @@ class IndexifyClient:
  data=labels,
  params=params,
  )
- response.raise_for_status()
  response_json = response.json()
  return response_json["content_id"]

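upload_file likewise takes the extraction graph name(s) as its first argument and returns the new content id. A sketch with a hypothetical local file:

    content_id = client.upload_file(
        "summaries", "report.pdf", labels={"source": "manual-upload"}
    )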
@@ -535,20 +497,47 @@ class IndexifyClient:
  List all schemas in the current namespace.
  """
  response = self.get(f"namespaces/{self.namespace}/schemas")
- response.raise_for_status()
  return response.json()
-
- def get_content_tree(self, content_id:str):
+
+ def get_content_tree(self, content_id: str):
  """
  Get content tree for a given content id

  Args:
  - content_id (str): id of content
  """
- response = self.get(f"namespaces/{self.namespace}/content/{content_id}/content-tree")
- response.raise_for_status()
+ response = self.get(
+ f"namespaces/{self.namespace}/content/{content_id}/content-tree"
+ )
  return response.json()
+
+ def get_extracted_content(self, content_id: str, level: int = 0):
+ """
+ Get list of children for a given content id and their content up to the specified level.
+
+ Args:
+ - content_id (str): id of content
+ - level (int): depth of content retrieval (default: 0)
+ """
+ content_tree = self.get_content_tree(content_id)
+ child_list = []
+
+ def traverse_content(parent_id, current_level):
+ if current_level > level:
+ return
+
+ for item in content_tree['content_tree_metadata']:
+ if item['parent_id'] == parent_id:
+ child_id = item['id']
+ content = self.download_content(child_id)
+ child_list.append({'id': child_id, 'content': content})
+
+ traverse_content(child_id, current_level + 1)
+
+ traverse_content(content_id, 0)

+ return child_list
+
  def sql_query(self, query: str):
  """
  Execute a SQL query.
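
The reworked get_extracted_content walks the content tree client-side: it fetches the tree once, then recursively collects children and their downloaded bytes down to the requested level. A sketch reusing a content id from an earlier ingestion:

    children = client.get_extracted_content(content_id)  # level=0: direct children only
    for child in children:
        print(child["id"], len(child["content"]), "bytes")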
@@ -562,24 +551,38 @@ class IndexifyClient:
  json=req,
  headers={"Content-Type": "application/json"},
  )
- response.raise_for_status()
  result = response.json()
  rows = []
  for row in result["rows"]:
  data = row["data"]
  rows.append(data)
  return SqlQueryResult(result=rows)
-
- def ingest_remote_file(self, url: str, mime_type: str, labels: Dict[str, str], id=None):
- req = {"url": url, "mime_type": mime_type, "labels": labels, "id": id}
+
+ def ingest_remote_file(
+ self, extraction_graphs: Union[str, List[str]], url: str, mime_type: str, labels: Dict[str, str], id=None
+ ):
+ if isinstance(extraction_graphs, str):
+ extraction_graphs = [extraction_graphs]
+ req = {"url": url, "mime_type": mime_type, "labels": labels, "id": id, "extraction_graph_names": extraction_graphs}
  response = self.post(
  f"namespaces/{self.namespace}/ingest_remote_file",
  json=req,
  headers={"Content-Type": "application/json"},
  )
- response.raise_for_status()
  return response.json()

+ def wait_for_extraction(self, content_id: str):
+ """
+ Wait for extraction to complete for a given content id
+
+ Args:
+ - content_id (str): id of content
+ """
+ response = self.get(
+ f"namespaces/{self.namespace}/content/{content_id}/wait"
+ )
+ response.raise_for_status()
+
  def generate_unique_hex_id(self):
  """
  Generate a unique hexadecimal identifier
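
Combined with the new wait_for_extraction, remote ingestion can be made effectively synchronous. A hedged end-to-end sketch; the URL is made up and the content_id response key is an assumption about the server payload:

    resp = client.ingest_remote_file(
        "summaries", "https://example.com/paper.pdf", "application/pdf", labels={}
    )
    content_id = resp["content_id"]  # assumed response key
    client.wait_for_extraction(content_id)
    tree = client.get_content_tree(content_id)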
@@ -588,18 +591,29 @@ class IndexifyClient:
  str: a unique hexadecimal string
  """
  return uuid.uuid4().hex[:16]
-
+
  def generate_hash_from_string(self, input_string: str):
  """
  Generate a hash for the given string and return it as a hexadecimal string.
-
+
  Args:
  input_string (str): The input string to hash.
-
+
  Returns:
  str: The hexadecimal hash of the input string.
  """
  hash_object = hashlib.sha256(input_string.encode())
  return hash_object.hexdigest()[:16]

+ def __print_additional_error_context(self, error: Error):
+ print(error)
+
+ if error.status == "ExtractionGraphError":
+ graphs = [eg.name for eg in self.extraction_graphs]
+ extractors = [ext.name for ext in self.extractors()]
+ print(f"Available extraction graphs: {graphs}")
+ print(f"Available extractors: {extractors}")

+ if error.status == "SearchError":
+ indexes = [index["name"] for index in self.indexes()]
+ print(f"Available indexes: {indexes}")
@@ -0,0 +1,30 @@
+ class Error(Exception):
+ status: str
+ message: str
+
+ def __init__(self, status: str, message: str):
+ self.status = status
+ self.message = message
+
+ @staticmethod
+ def from_tonic_error_string(url: str, error: str) -> "Error":
+ data = error.split(", ")
+
+ message = data[1].split(": ", 1)[1]
+ if message.startswith('"') and message.endswith('"'):
+ message = message[1:-1]
+
+ status = "GeneralError"
+ if "extraction_graph" in url:
+ status = "ExtractionGraphError"
+ elif "search" in url:
+ status = "SearchError"
+
+ error = Error(status, message)
+ return error
+
+ def __str__(self):
+ return f"{self.status} | {self.message.capitalize()}"
+
+ def __repr__(self):
+ return f"Error(status={self.status!r}, message={self.message!r})"
@@ -0,0 +1,68 @@
+ from dataclasses import dataclass, asdict
+ from typing import Optional, List
+
+
+ @dataclass
+ class ExtractionPolicy:
+ extractor: str
+ name: str
+ content_source: str
+ input_params: Optional[dict] = None
+ id: Optional[str] = None
+ labels_eq: Optional[str] = None
+
+ def __repr__(self) -> str:
+ return f"ExtractionPolicy(name={self.name} extractor={self.extractor})"
+
+ def __str__(self) -> str:
+ return self.__repr__()
+
+ def to_dict(self) -> dict:
+ filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
+ return filtered_dict
+
+ @classmethod
+ def from_dict(cls, json: dict):
+ if "filters_eq" in json:
+ json["labels_eq"] = json.pop("filters_eq")
+ json["id"] = json.get("id", None)
+ return ExtractionPolicy(**json)
+
+
+ @dataclass
+ class ExtractionGraph:
+ id: str
+ name: str
+ extraction_policies: List[ExtractionPolicy]
+
+ @classmethod
+ def from_dict(cls, json: dict):
+ json["id"] = json.get("id", None)
+ if "namespace" in json.keys():
+ json.pop("namespace")
+ return ExtractionGraph(**json)
+
+ @staticmethod
+ def from_yaml(spec: str):
+ import yaml
+
+ return ExtractionGraph.from_dict(yaml.load(spec, Loader=yaml.FullLoader))
+
+ def to_dict(self) -> dict:
+ filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
+ return filtered_dict
+
+
+ class ExtractionGraphBuilder:
+ def __init__(self, name: str):
+ self.name = name
+ self.extraction_policies = []
+
+ def policy(self, policy: ExtractionPolicy) -> "ExtractionGraphBuilder":
+ self.extraction_policies.append(policy)
+ return self
+
+ def build(self):
+ return ExtractionGraph(
+ id=None, name=self.name, extraction_policies=self.extraction_policies
+ )
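
The same graph can be declared fluently with ExtractionGraphBuilder (build() constructs it without a server-assigned id) or as YAML via from_yaml; note that from_dict keeps the policy mappings as plain dicts rather than ExtractionPolicy instances. A sketch with the same hypothetical names as above:

    graph = (
        ExtractionGraphBuilder("summaries")
        .policy(
            ExtractionPolicy(
                extractor="tensorlake/minilm-l6",
                name="minilm",
                content_source="ingestion",
            )
        )
        .build()
    )

    spec = """
    name: summaries
    extraction_policies:
      - extractor: tensorlake/minilm-l6
        name: minilm
        content_source: ingestion
    """
    same_graph = ExtractionGraph.from_yaml(spec)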
@@ -17,7 +17,12 @@ class ExtractorSchema:

  class Extractor:
  def __init__(
- self, name: str, description: str, input_params: dict, outputs: ExtractorSchema, input_mime_types: list[str]
+ self,
+ name: str,
+ description: str,
+ input_params: dict,
+ outputs: ExtractorSchema,
+ input_mime_types: list[str],
  ):
  self.name = name
  self.description = description
@@ -0,0 +1,2 @@
+ DEFAULT_SERVICE_URL = "http://localhost:8900"
+ DEFAULT_SERVICE_URL_HTTPS = "https://localhost:8900"
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "indexify"
- version = "0.0.20"
+ version = "0.0.22"
  description = "Python Client for Indexify"
  authors = ["Diptanu Gon Choudhury <diptanuc@gmail.com>", "Vijay Parthasarathy <vijay2win@gmail.com>"]
  license = "Apache 2.0"
@@ -1,28 +0,0 @@
- from dataclasses import dataclass, asdict
- from typing import Optional
-
-
- @dataclass
- class ExtractionPolicy:
- extractor: str
- name: str
- content_source: str
- input_params: dict
- id: Optional[str] = None
- labels_eq: Optional[str] = None
-
- def __repr__(self) -> str:
- return f"ExtractionPolicy(name={self.name} extractor={self.extractor})"
-
- def __str__(self) -> str:
- return self.__repr__()
-
- def to_dict(self) -> dict:
- filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
- return filtered_dict
-
- @classmethod
- def from_dict(cls, json: dict):
- if "filters_eq" in json:
- json["labels_eq"] = json.pop("filters_eq")
- return ExtractionPolicy(**json)
@@ -1,2 +0,0 @@
- DEFAULT_SERVICE_URL = "http://localhost:8900"
- DEFAULT_SERVICE_URL_HTTPS = "https://localhost:8900"
4 files without changes