indexify-0.0.35-py3-none-any.whl → indexify-0.0.36-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
indexify/__init__.py CHANGED
@@ -1,13 +1,25 @@
- from .index import Index
  from .client import IndexifyClient
  from .extraction_policy import ExtractionGraph
- from .client import IndexifyClient, Document, generate_hash_from_string, generate_unique_hex_id
- from .data_containers import Content
+ from .client import (
+     IndexifyClient,
+     Document,
+     generate_hash_from_string,
+     generate_unique_hex_id,
+ )
+ from .data import ContentMetadata, Content, Feature
+ from .extractor import Extractor, extractor, EmbeddingSchema
  from .settings import DEFAULT_SERVICE_URL
+ from . import data_loaders
 
  __all__ = [
-     "Index",
+     "ContentMetadata",
      "Content",
+     "data_loaders",
+     "Feature",
+     "Extractor",
+     "extractor",
+     "EmbeddingSchema",
      "Document",
      "IndexifyClient",
      "ExtractionGraph",
indexify/client.py CHANGED
@@ -6,11 +6,11 @@ import json
  from collections import namedtuple
  from .settings import DEFAULT_SERVICE_URL, DEFAULT_SERVICE_URL_HTTPS
  from .extractor import Extractor
- from .extraction_policy import ExtractionPolicy, ExtractionGraph
- from .index import Index
+ from .extraction_policy import ExtractionGraph
  from .utils import json_set_default
  from .error import Error
- from .data_containers import TextChunk, Content
+ from .data import Content, ContentMetadata
+ from .data_loaders import DataLoader
  from indexify.exceptions import ApiException
  from dataclasses import dataclass
  from typing import List, Optional, Union, Dict
@@ -316,7 +316,7 @@ class IndexifyClient:
              "content_url": f"{self._service_url}/namespaces/{self.namespace}/content/{content['id']}/download",
          }
 
-     def indexes(self) -> List[Index]:
+     def indexes(self) -> dict:
          """
          Get the indexes of the current namespace.
 
@@ -399,7 +399,9 @@ class IndexifyClient:
          Args:
              - content_id (str): content id to query
          """
-         response = self.get(f"namespaces/{self.namespace}/content/{content_id}/metadata")
+         response = self.get(
+             f"namespaces/{self.namespace}/content/{content_id}/metadata"
+         )
          return response.json()["content_metadata"]
 
      def download_content(self, content_id: str) -> bytes:
@@ -409,7 +411,9 @@
          Args:
              - content_id (str): id of content to download
          """
-         response = self.get(f"namespaces/{self.namespace}/content/{content_id}/download")
+         response = self.get(
+             f"namespaces/{self.namespace}/content/{content_id}/download"
+         )
          return response.content
 
      def add_documents(
@@ -520,7 +524,7 @@
 
      def search_index(
          self, name: str, query: str, top_k: int, filters: List[str] = []
-     ) -> list[TextChunk]:
+     ) -> dict:
          """
          Search index in the current namespace.
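With `Index` and `TextChunk` removed, `search_index` now returns the parsed JSON response rather than typed chunks. A hedged usage sketch; the index name and the shape of the returned payload are assumptions, not confirmed by this diff:

    # "wiki.embedding" is a hypothetical index name
    results = client.search_index("wiki.embedding", "first and only marriage", top_k=3)
    print(results)  # inspect the raw payload; its schema is server-defined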
 
@@ -573,34 +577,59 @@
 
      def upload_file(
          self,
-         extraction_graphs: Union[str, List[str]],
-         path: str,
+         extraction_graph: str,
+         path: str,
+         file_bytes: bytes = None,
          id=None,
          labels: dict = {},
      ) -> str:
          """
-         Upload a file.
+         Upload a file from a path or the bytes.
 
          Args:
-             - path (str): relative path to the file to be uploaded
+             - extraction_graph (str): name of the extraction graph to use for extraction
+             - path (Union[str, bytes]): relative path to the file to be uploaded, or the bytes of the file
              - labels (dict): labels to be associated with the file
          """
-         if isinstance(extraction_graphs, str):
-             extraction_graphs = [extraction_graphs]
          params = {}
          if id is not None:
              params["id"] = id
-         with open(path, "rb") as f:
-             for extraction_graph in extraction_graphs:
+ 
+         if file_bytes is None:
+             with open(path, "rb") as f:
                  response = self.post(
                      f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
                      files={"file": f},
                      data={"labels": json.dumps(labels)},
                      params=params,
                  )
-                 response_json = response.json()
-                 content_id = response_json["content_id"]
-         return content_id
+         else:
+             response = self.post(
+                 f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract",
+                 files={"file": (path, file_bytes)},
+                 data={"labels": json.dumps(labels)},
+                 params=params,
+             )
+ 
+         response_json = response.json()
+         content_id = response_json["content_id"]
+         return content_id
+ 
+     def ingest_from_loader(self, loader: DataLoader, extraction_graph: str) -> List[str]:
+         """
+         Loads content using the loader, uploads it to Indexify and returns the content ids.
+ 
+         loader: DataLoader: The DataLoader object to use for loading content
+         extraction_graph: str: The name of the extraction graph to use for extraction
+         """
+         content_ids = []
+         files = loader.load()
+         for file_metadata in files:
+             labels = {"file_name": file_metadata.path}
+             content_id = self.upload_file(
+                 extraction_graph,
+                 file_metadata.path,
+                 file_metadata.read_all_bytes(),
+                 labels=labels,
+             )
+             content_ids.append(content_id)
+         return content_ids
 
      def list_schemas(self) -> List[str]:
          """
@@ -610,7 +639,11 @@
          return response.json()
 
      def get_extracted_content(
-         self, ingested_content_id: str, graph_name: str, policy_name: str, blocking=False
+         self,
+         ingested_content_id: str,
+         graph_name: str,
+         policy_name: str,
+         blocking=False,
      ):
          """
          Get the list of children for a given content id, and their content, up to the specified level.
@@ -631,10 +664,16 @@
          for item in content_tree["content_tree_metadata"]:
              if (
                  graph_name in item["extraction_graph_names"]
-                 and item["source"] == policy_name
+                 and item["source"] == policy_name
              ):
                  content = self.download_content(item["id"])
-                 child_list.append({"id": item["id"], "mime_type": item["mime_type"], "content": content})
+                 child_list.append(
+                     {
+                         "id": item["id"],
+                         "mime_type": item["mime_type"],
+                         "content": content,
+                     }
+                 )
 
          return child_list
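Each entry returned by `get_extracted_content` carries the child's id, mime type and raw bytes. A sketch with hypothetical graph and policy names, reusing the `cid` from the upload above:

    children = client.get_extracted_content(cid, "summarize", "chunks")
    for child in children:
        print(child["id"], child["mime_type"], len(child["content"]))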
 
@@ -660,23 +699,20 @@
 
      def ingest_remote_file(
          self,
-         extraction_graphs: Union[str, List[str]],
+         extraction_graph: str,
          url: str,
          mime_type: str,
          labels: Dict[str, str],
          id=None,
      ):
-         if isinstance(extraction_graphs, str):
-             extraction_graphs = [extraction_graphs]
          req = {
              "url": url,
              "mime_type": mime_type,
              "labels": labels,
              "id": id,
-             "extraction_graph_names": extraction_graphs,
          }
          response = self.post(
-             f"namespaces/{self.namespace}/ingest_remote_file",
+             f"namespaces/{self.namespace}/extraction_graphs/{extraction_graph}/extract_remote",
              json=req,
              headers={"Content-Type": "application/json"},
indexify/data.py ADDED
@@ -0,0 +1,83 @@
+ from typing import Any, List, Optional, Literal, Dict
+ from pydantic import BaseModel, Json, Field
+ import json
+ 
+ 
+ class Feature(BaseModel):
+     feature_type: Literal["embedding", "metadata"]
+     name: str
+     value: Json
+     comment: Optional[Json] = Field(default=None)
+ 
+     @classmethod
+     def embedding(cls, values: List[float], name: str = "embedding", distance="cosine"):
+         return cls(
+             feature_type="embedding",
+             name=name,
+             value=json.dumps({"values": values, "distance": distance}),
+             comment=None,
+         )
+ 
+     @classmethod
+     def metadata(cls, value: Json, comment: Json = None, name: str = "metadata"):
+         value = json.dumps(value)
+         comment = json.dumps(comment) if comment is not None else None
+         return cls(feature_type="metadata", name=name, value=value, comment=comment)
+ 
+ 
+ class Content(BaseModel):
+     id: str = "none-for-now"
+     content_type: Optional[str]
+     data: bytes
+     features: List[Feature] = []
+ 
+     @classmethod
+     def from_text(
+         cls,
+         text: str,
+         features: List[Feature] = [],
+     ):
+         return Content(
+             id="none-for-now",
+             content_type="text/plain",
+             data=bytes(text, "utf-8"),
+             features=features,
+         )
+ 
+     @classmethod
+     def from_json(cls, json_data: Json, features: List[Feature] = []):
+         return cls(
+             content_type="application/json",
+             data=bytes(json.dumps(json_data), "utf-8"),
+             features=features,
+         )
+ 
+     @classmethod
+     def from_file(cls, path: str):
+         import mimetypes
+ 
+         m, _ = mimetypes.guess_type(path)
+         with open(path, "rb") as f:
+             return cls(content_type=m, data=f.read())
+ 
+ 
+ class ContentMetadata(BaseModel):
+     id: str
+     parent_id: str
+     labels: Dict[str, Any]
+     extraction_graph_names: List[str]
+     extraction_policy: str
+     mime_type: str
+     extracted_metadata: Dict[str, Any] = {}
+ 
+     @classmethod
+     def from_dict(cls, json: Dict):
+         return cls(
+             id=json["id"],
+             parent_id=json["parent_id"],
+             labels=json["labels"],
+             extraction_graph_names=json["extraction_graph_names"],
+             extraction_policy=json["source"],
+             mime_type=json["mime_type"],
+             extracted_metadata=json["extracted_metadata"],
+         )
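The pydantic models above replace the old dataclasses in `data_containers.py`. A small sketch of the constructors defined in this file (all values illustrative):

    from indexify import Content, Feature

    doc = Content.from_text(
        "hello world",
        features=[Feature.metadata({"lang": "en"}, name="language")],
    )
    emb = Feature.embedding([0.1, 0.2, 0.3], distance="cosine")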
indexify/data_loaders/__init__.py ADDED
@@ -0,0 +1,55 @@
+ from pydantic import BaseModel
+ 
+ from abc import ABC, abstractmethod
+ from typing import List
+ import os
+ import mimetypes
+ import hashlib
+ 
+ 
+ class FileMetadata(BaseModel):
+     path: str
+     file_size: int
+     mime_type: str
+     md5_hash: str
+     created_at: int
+     updated_at: int
+ 
+     @classmethod
+     def from_path(cls, path: str):
+         file_size = os.path.getsize(path)
+         mime_type = mimetypes.guess_type(path)[0]
+ 
+         # Compute MD5 hash
+         hash_md5 = hashlib.md5()
+         with open(path, "rb") as f:
+             for chunk in iter(lambda: f.read(4096), b""):
+                 hash_md5.update(chunk)
+         md5_hash = hash_md5.hexdigest()
+ 
+         created_at = int(os.path.getctime(path))
+         updated_at = int(os.path.getmtime(path))
+ 
+         return cls(
+             path=path,
+             file_size=file_size,
+             mime_type=str(mime_type),
+             md5_hash=md5_hash,
+             created_at=created_at,
+             updated_at=updated_at,
+         )
+ 
+     def read_all_bytes(self) -> bytes:
+         with open(self.path, "rb") as f:
+             return f.read()
+ 
+ 
+ class DataLoader(ABC):
+     @abstractmethod
+     def load(self) -> List[FileMetadata]:
+         pass
+ 
+     @abstractmethod
+     def state(self) -> dict:
+         pass
+ 
+ 
+ from .local_directory_loader import LocalDirectoryLoader
indexify/data_loaders/local_directory_loader.py ADDED
@@ -0,0 +1,27 @@
+ from . import DataLoader, FileMetadata
+ from typing import List, Optional
+ import os
+ 
+ 
+ class LocalDirectoryLoader(DataLoader):
+     def __init__(
+         self,
+         directory: str,
+         file_extensions: Optional[List[str]] = None,
+         state: dict = {},
+     ):
+         self.directory = directory
+         self.file_extensions = file_extensions
+         self.processed_files = set(state.get("processed_files", []))
+ 
+     def load(self) -> List[FileMetadata]:
+         file_metadata_list = []
+         for root, _, files in os.walk(self.directory):
+             for file in files:
+                 if self.file_extensions is None or any(
+                     file.endswith(ext) for ext in self.file_extensions
+                 ):
+                     file_path = os.path.join(root, file)
+                     if file_path not in self.processed_files:
+                         file_metadata_list.append(FileMetadata.from_path(file_path))
+                         self.processed_files.add(file_path)
+ 
+         return file_metadata_list
+ 
+     def state(self) -> dict:
+         return {"processed_files": list(self.processed_files)}
indexify/extraction_policy.py CHANGED
@@ -49,7 +49,7 @@ class ExtractionGraph:
          import yaml
 
          return ExtractionGraph.from_dict(yaml.load(spec, Loader=yaml.FullLoader))
- 
+ 
      @staticmethod
      def from_yaml_file(path: str):
          with open(path, "r") as f:
indexify/extractor.py CHANGED
@@ -1,47 +1,120 @@
- from dataclasses import dataclass
- from typing import Union
+ from typing import Union, Optional, List, Type, Tuple, Callable, get_type_hints, Dict
+ import inspect
+ from pydantic import BaseModel
+ from abc import ABC, abstractmethod
+ from .data import Content, Feature
+ import json
 
- from .settings import DEFAULT_SERVICE_URL
+ class EmbeddingSchema(BaseModel):
+     dimension: int
 
+ class Extractor(ABC):
+     name: str = ""
 
- @dataclass
- class EmbeddingSchema:
-     distance: str
-     dim: int
+     version: str = "0.0.0"
 
+     system_dependencies: List[str] = []
 
- @dataclass
- class ExtractorSchema:
-     outputs: dict[str, Union[EmbeddingSchema, dict]]
+     python_dependencies: List[str] = []
 
+     description: str = ""
 
- class Extractor:
-     def __init__(
-         self,
-         name: str,
-         description: str,
-         input_params: dict,
-         outputs: ExtractorSchema,
-         input_mime_types: list[str],
-     ):
-         self.name = name
-         self.description = description
-         self.input_params = input_params
-         self.outputs = outputs
-         self.input_mime_types = input_mime_types
+     input_mime_types = ["text/plain"]
+ 
+     def extract(
+         self, content: Content, params: Type[BaseModel] = None
+     ) -> List[Union[Feature, Content]]:
+         """
+         Extracts information from the content. Returns a list of features to add
+         to the content.
+         It can also return a list of Content objects, which will be added to storage
+         and any extraction policies defined will be applied to them.
+         """
+         pass
+ 
+     def extract_batch(
+         self, content_list: List[Content], params: List[Type[BaseModel]] = None
+     ) -> List[List[Union[Feature, Content]]]:
+         """
+         Extracts information from each item in the content list. Returns, for each
+         input, a list of features to add to the content.
+         It can also return a list of Content objects, which will be added to storage
+         and any extraction policies defined will be applied to them.
+         """
+         pass
+ 
+     @classmethod
+     @abstractmethod
+     def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
+         pass
 
      @classmethod
-     def from_dict(cls, data):
-         return Extractor(
-             name=data["name"],
-             description=data["description"],
-             input_params=data["input_params"],
-             input_mime_types=data["input_mime_types"],
-             outputs=data["outputs"],
+     @abstractmethod
+     def embedding_schemas(cls) -> Dict[str, EmbeddingSchema]:
+         raise NotImplementedError
+ 
+     def describe(self) -> Dict:
+         embedding_schemas = {}
+         try:
+             embedding_schemas = self.embedding_schemas()
+         except NotImplementedError:
+             pass
+ 
+         json_schema = (
+             self._param_cls.model_json_schema() if getattr(self, "_param_cls", None) is not None else None
          )
 
-     def __repr__(self) -> str:
-         return f"Extractor(name={self.name}, description={self.description}, input_params={self.input_params}, input_mime_types={self.input_mime_types}, outputs={self.outputs})"
+         return {
+             "name": self.name,
+             "version": self.version,
+             "description": self.description,
+             "system_dependencies": self.system_dependencies,
+             "python_dependencies": self.python_dependencies,
+             "input_mime_types": self.input_mime_types,
+             "embedding_schemas": embedding_schemas,
+             "input_params": json.dumps(json_schema),
+         }
+ 
+ 
+ def extractor(
+     name: Optional[str] = None,
+     description: Optional[str] = "",
+     version: Optional[str] = "",
+     python_dependencies: Optional[List[str]] = None,
+     system_dependencies: Optional[List[str]] = None,
+     input_mime_types: Optional[List[str]] = None,
+     embedding_schemas: Optional[Dict[str, EmbeddingSchema]] = None,
+     sample_content: Optional[Callable] = None,
+ ):
+     args = locals()
+     del args["sample_content"]
+ 
+     def construct(fn):
+         def wrapper():
+             hint = get_type_hints(fn).get("params", dict)
+ 
+             if not args.get("name"):
+                 args["name"] = (
+                     f"{inspect.getmodule(inspect.stack()[1][0]).__name__}:{fn.__name__}"
+                 )
+ 
+             class DecoratedFn(Extractor):
+                 @classmethod
+                 def extract(cls, content: Content, params: hint) -> List[Content]:  # type: ignore
+                     # TODO we can force all the functions to take in a params object
+                     # or check if someone adds a params
+                     if params is None:
+                         return fn(content)
+                     else:
+                         return fn(content, params)
+ 
+                 def sample_input(self) -> Content:
+                     return sample_content() if sample_content else self.sample_text()
+ 
+             for key, val in args.items():
+                 setattr(DecoratedFn, key, val)
+ 
+             return DecoratedFn
+ 
+         return wrapper
 
-     def __str__(self) -> str:
-         return self.__repr__()
+     return construct
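`Extractor` is now the abstract contract an extractor implements; metadata lives in class attributes and `describe()` serializes it for the server. A minimal hypothetical subclass sketched against this interface (not part of the package; `_param_cls` is set explicitly since this extractor takes no input params):

    from typing import List, Tuple, Type, Union
    from pydantic import BaseModel
    from indexify.extractor import Extractor
    from indexify.data import Content, Feature

    class WordCountExtractor(Extractor):
        name = "example/wordcount"  # hypothetical name
        description = "Adds a word-count metadata feature"
        input_mime_types = ["text/plain"]
        _param_cls = None  # no input params

        def extract(self, content: Content, params: Type[BaseModel] = None) -> List[Union[Feature, Content]]:
            words = len(content.data.decode("utf-8").split())
            return [Feature.metadata({"words": words}, name="wordcount")]

        @classmethod
        def sample_input(cls) -> Tuple[Content, Type[BaseModel]]:
            return Content.from_text("hello world"), None

        @classmethod
        def embedding_schemas(cls):
            return {}  # no embedding outputs

    print(WordCountExtractor().describe())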
indexify/extractor_utils.py ADDED
@@ -0,0 +1,108 @@
+ import os
+ import httpx
+ from typing import List
+ from .data import Content, Feature
+ 
+ 
+ class SampleExtractorData:
+ 
+     def _download_file(self, url, filename):
+         if os.path.exists(filename):
+             # file exists, skip the download
+             return
+         try:
+             with httpx.stream("GET", url) as r:
+                 r.raise_for_status()  # raises an error if the response status code is 4XX/5XX
+                 with open(filename, "wb") as f:
+                     for chunk in r.iter_bytes(chunk_size=8192):
+                         f.write(chunk)
+         except httpx.HTTPError as e:
+             print(f"Error downloading the file: {e}")
+ 
+     def sample_mp3(self, features: List[Feature] = []) -> Content:
+         file_name = "sample.mp3"
+         self._download_file(
+             "https://extractor-files.diptanu-6d5.workers.dev/sample-000009.mp3",
+             file_name,
+         )
+         with open(file_name, "rb") as f:
+             return Content(content_type="audio/mpeg", data=f.read(), features=features)
+ 
+     def sample_mp4(self, features: List[Feature] = []) -> Content:
+         file_name = "sample.mp4"
+         self._download_file(
+             "https://extractor-files.diptanu-6d5.workers.dev/sample.mp4",
+             file_name,
+         )
+         with open(file_name, "rb") as f:
+             return Content(content_type="video/mp4", data=f.read(), features=features)
+ 
+     def sample_jpg(self, features: List[Feature] = []) -> Content:
+         file_name = "sample.jpg"
+         self._download_file(
+             "https://extractor-files.diptanu-6d5.workers.dev/people-standing.jpg",
+             file_name,
+         )
+         with open(file_name, "rb") as f:
+             return Content(content_type="image/jpg", data=f.read(), features=features)
+ 
+     def sample_invoice_jpg(self, features: List[Feature] = []) -> Content:
+         file_name = "sample.jpg"
+         self._download_file(
+             "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.jpg",
+             file_name,
+         )
+         with open(file_name, "rb") as f:
+             return Content(content_type="image/jpg", data=f.read(), features=features)
+ 
+     def sample_invoice_pdf(self, features: List[Feature] = []) -> Content:
+         file_name = "sample.pdf"
+         self._download_file(
+             "https://extractor-files.diptanu-6d5.workers.dev/invoice-example.pdf",
+             file_name,
+         )
+         with open(file_name, "rb") as f:
+             return Content(content_type="application/pdf", data=f.read(), features=features)
+ 
+     def sample_image_based_pdf(self, features: List[Feature] = []) -> Content:
+         file_name = "sample.pdf"
+         self._download_file(
+             "https://extractor-files.diptanu-6d5.workers.dev/image-based.pdf",
+             file_name,
+         )
+         with open(file_name, "rb") as f:
+             return Content(content_type="application/pdf", data=f.read(), features=features)
+ 
+     def sample_scientific_pdf(self, features: List[Feature] = []) -> Content:
+         file_name = "sample.pdf"
+         self._download_file(
+             "https://extractor-files.diptanu-6d5.workers.dev/scientific-paper-example.pdf",
+             file_name,
+         )
+         with open(file_name, "rb") as f:
+             return Content(content_type="application/pdf", data=f.read(), features=features)
+ 
+     def sample_presentation(self, features: List[Feature] = []) -> Content:
+         file_name = "test.pptx"
+         self._download_file(
+             "https://raw.githubusercontent.com/tensorlakeai/indexify/main/docs/docs/files/test.pptx",
+             file_name,
+         )
+         with open(file_name, "rb") as f:
+             return Content(
+                 content_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                 data=f.read(),
+                 features=features,
+             )
+ 
+     def sample_text(self, features: List[Feature] = []) -> Content:
+         article = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18."""
+         return Content(content_type="text/plain", data=article, features=features)
+ 
+     def sample_html(self, features: List[Feature] = []) -> Content:
+         file_name = "sample.html"
+         self._download_file(
+             "https://extractor-files.diptanu-6d5.workers.dev/sample.html",
+             file_name,
+         )
+         with open(file_name, "rb") as f:
+             return Content(content_type="text/html", data=f.read(), features=features)
indexify/graph.py ADDED
@@ -0,0 +1,49 @@
+ from indexify import Content, extractor
+ from indexify.extractor import Extractor
+ 
+ from collections import defaultdict
+ from typing import Any, Callable, Dict, List, Optional, Tuple
+ 
+ 
+ @extractor(description="id function")
+ def _id(content: Content) -> List[Content]:
+     return [content]
+ 
+ 
+ class Graph:
+     def __init__(self, name: str):
+         # TODO check for cycles
+         self.name = name
+ 
+         self.nodes: Dict[str, Callable] = {}
+         self.params: Dict[str, Any] = {}
+ 
+         self.edges: Dict[str, List[Tuple[str, str]]] = defaultdict(list)
+ 
+         self.results: Dict[str, Any] = defaultdict(list)  # TODO should the Any be Content?
+ 
+         self.nodes["start"] = _id
+         self.nodes["end"] = _id
+ 
+         self._topo_counter = defaultdict(int)
+ 
+         self._start_node = None
+ 
+     def node(self, name: str, closure: Extractor, params: Any = None) -> None:
+         if name in self.nodes:
+             raise Exception(f"Cannot insert node, node with name: `{name}` already exists")
+ 
+         self.nodes[name] = closure
+         self.params[name] = params
+ 
+         # assign each node a rank of 1 to init the graph
+         self._topo_counter[name] = 1
+ 
+     def edge(self, from_node: str, to_node: str, prefilter_predicates: Optional[str] = None) -> None:
+         self.edges[from_node].append((to_node, prefilter_predicates))
+ 
+         self._topo_counter[to_node] += 1
+ 
+     def _assign_start_node(self):
+         # this method must be called before a graph can be run
+         nodes = sorted(self._topo_counter.items(), key=lambda x: x[1])
+         self._start_node = nodes[0][0]
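`Graph` is an experimental in-process pipeline: `node` registers a decorated extractor, `edge` wires outputs to inputs, and `_assign_start_node` picks the node with the lowest rank as the entry point. A hypothetical two-node graph, all names illustrative:

    from typing import List
    from indexify import Content, extractor
    from indexify.graph import Graph

    @extractor(description="split text into lines")
    def split_lines(content: Content) -> List[Content]:
        text = content.data.decode("utf-8")
        return [Content.from_text(line) for line in text.splitlines()]

    @extractor(description="uppercase the text")
    def upper(content: Content) -> List[Content]:
        return [Content.from_text(content.data.decode("utf-8").upper())]

    g = Graph("demo")
    g.node("split", split_lines)
    g.node("upper", upper)
    g.edge("split", "upper")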
indexify/local_runner.py ADDED
@@ -0,0 +1,53 @@
+ from indexify import Content
+ 
+ from collections import defaultdict
+ from typing import Any, Callable, Dict, Optional
+ 
+ 
+ class LocalRunner:
+     def __init__(self):
+         self.results: Dict[str, Any] = defaultdict(list)  # TODO should the Any be Content?
+ 
+     def run(self, g, content: Content):
+         g._assign_start_node()
+         return self._run(g, content=content, node_name=g._start_node)
+ 
+     def _run(self, g, content: Content, node_name: str):
+         extractor_construct: Callable = g.nodes[node_name]
+         params = g.params.get(node_name, None)
+ 
+         res = extractor_construct().extract(content=content, params=params)
+ 
+         self.results[node_name].extend(res)
+ 
+         for out_edge, pre_filter_predicate in g.edges[node_name]:
+             # TODO there are no reductions yet, each recursion finishes its path and returns
+             for r in res:
+                 if self._prefilter_content(content=r, prefilter_predicate=pre_filter_predicate):
+                     continue
+ 
+                 self._run(g, content=r, node_name=out_edge)
+ 
+     def _prefilter_content(self, content: Content, prefilter_predicate: Optional[str]) -> bool:
+         if prefilter_predicate is None:
+             return False
+ 
+         atoms = prefilter_predicate.split("and")
+         if len(atoms) == 0 or len(atoms) == 1:
+             return False
+ 
+         # TODO for now only support `and`, `=` and string values
+         bools = []
+         for feature in content.features:
+             if feature.feature_type == "metadata":
+                 values = feature.value
+ 
+                 for atom in atoms:
+                     l, r = atom.split("=")
+                     if l.strip() in values:
+                         bools.append(values[l.strip()] == r.strip())
+ 
+         return all(bools)
+ 
+     def get_result(self, node_name: str) -> Content:
+         return self.results[node_name]
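`LocalRunner` walks such a graph depth-first from the start node, applying each node's extractor and collecting every node's outputs in `results`. Continuing the hypothetical graph built above:

    from indexify import Content
    from indexify.local_runner import LocalRunner

    runner = LocalRunner()
    runner.run(g, Content.from_text("first line\nsecond line"))
    for out in runner.get_result("upper"):
        print(out.data.decode("utf-8"))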
{indexify-0.0.35.dist-info → indexify-0.0.36.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: indexify
- Version: 0.0.35
+ Version: 0.0.36
  Summary: Python Client for Indexify
  Home-page: https://github.com/tensorlakeai/indexify
  License: Apache 2.0
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Requires-Dist: httpx[http2] (>=0.26,<0.27)
+ Requires-Dist: pydantic (>=2.8,<3.0)
  Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
  Project-URL: Repository, https://github.com/tensorlakeai/indexify
  Description-Content-Type: text/markdown
indexify-0.0.36.dist-info/RECORD ADDED
@@ -0,0 +1,18 @@
+ indexify/__init__.py,sha256=W58FqmnKHIx-gHKTBDQa1QI49Gi8f1rw90yDg31jwgQ,743
+ indexify/client.py,sha256=czMeUoAMMiEH3txysdRTCu84mwWj9Ec_NjiXy6oc9Vw,25858
+ indexify/data.py,sha256=XWs5_rW2ZGldgwtqN62VwZF15ot1POBkf_X5ByVmfiI,2315
+ indexify/data_loaders/__init__.py,sha256=EiYemxCP4zRfDWnDKiX6-SFwXVmv1TSdcXHBQRbE_Uw,1309
+ indexify/data_loaders/local_directory_loader.py,sha256=kF7VwkuOJFBrhKrR7IOOdZ4TDAItw_CyUOfcuej1CKI,1080
+ indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
+ indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
+ indexify/extraction_policy.py,sha256=awNDqwCz0tr4jTQmGf7s8_s6vcEuxMb0xynEl7b7iPI,2076
+ indexify/extractor.py,sha256=Pzcn9gZET5XRz3OMGQ_k9XjFT8UVeyaynOT86_C08yY,3837
+ indexify/extractor_utils.py,sha256=68V5vZB9GYx648dyyVKAia0M4pG_R31QPqUQz3ZZ1FQ,6593
+ indexify/graph.py,sha256=5WQphl30vd606MHz_IZ23oZVQot9dPN79cksbhjUncA,1572
+ indexify/local_runner.py,sha256=yXKH2HrfhXdsx3vtrk3Q4LJGcWoAJazNfRhDp_s6Kx0,1900
+ indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
+ indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
+ indexify-0.0.36.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ indexify-0.0.36.dist-info/METADATA,sha256=6Kh0Ngr9iAQF0NPyRULy0KOE6n5i9XFQGaZiuyxP1ss,1891
+ indexify-0.0.36.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ indexify-0.0.36.dist-info/RECORD,,
indexify/data_containers.py DELETED
@@ -1,37 +0,0 @@
- from enum import Enum
- from typing import List
- from dataclasses import dataclass, field
- 
- 
- @dataclass
- class Content:
-     id: str
-     parent_id: str
-     labels: dict[str, any]
-     extraction_graph_names: List[str]
-     extraction_policy: str
-     mime_type: str
- 
-     @classmethod
-     def from_dict(cls, json: dict):
-         return Content(
-             id=json["id"],
-             parent_id=json["parent_id"],
-             labels=json["labels"],
-             extraction_graph_names=json["extraction_graph_names"],
-             extraction_policy=json["source"],
-             mime_type=json["mime_type"],
-         )
- 
- 
- @dataclass
- class TextChunk:
-     text: str
-     metadata: dict[str, any] = field(default_factory=dict)
-     score: float = 0.0
- 
-     def to_dict(self):
-         return {"text": self.text, "metadata": self.metadata}
- 
- 
- @dataclass
- class SearchResult:
-     results: List[TextChunk]
indexify/index.py DELETED
@@ -1,17 +0,0 @@
- import httpx
- 
- from .data_containers import TextChunk
- 
- 
- class Index:
-     def __init__(self, service_url, index):
-         self._service_url = service_url
-         self._index = index
- 
-     def search(self, query: str, top_k: int) -> list[TextChunk]:
-         req = {"index": self._index, "query": query, "k": top_k}
-         response = httpx.post(
-             f"{self._service_url}/indexes/{self._index}/search", json=req
-         )
-         response.raise_for_status()
-         return response.json()["results"]
indexify-0.0.35.dist-info/RECORD DELETED
@@ -1,14 +0,0 @@
- indexify/__init__.py,sha256=xqymbwqaiHiWXFpm7Cll2j-_V1lNQH2EEGlevtCTZK4,525
- indexify/client.py,sha256=WLnwUtvdJ17bEG7T2k_jxMOEDlTQwySwSXsLvV_FnSE,24692
- indexify/data_containers.py,sha256=fIX_rghpojrCUtmZ0grywoq_HWniDgN1mnR7yXDej-Y,874
- indexify/error.py,sha256=3umTeYb0ugtUyehV1ibfvaeACxAONPyWPc-1HRN4d1M,856
- indexify/exceptions.py,sha256=vjd5SPPNFIEW35GorSIodsqvm9RKHQm9kdp8t9gv-WM,111
- indexify/extraction_policy.py,sha256=POluredrBw6DzTN0OyfPLaLFP5-2DoWGRK0V6w68R28,2080
- indexify/extractor.py,sha256=sWFLlXHgEfWlmiKAXN6ytUt_uG7th-XGNHqz-TG39gs,1216
- indexify/index.py,sha256=RvxYhJXEth-GKvqzlMiz5PuN1eIbZk84pt20piA1Gsw,504
- indexify/settings.py,sha256=LSaWZ0ADIVmUv6o6dHWRC3-Ry5uLbCw2sBSg1e_U7UM,99
- indexify/utils.py,sha256=rDN2lrsAs9noJEIjfx6ukmC2SAIyrlUt7QU-kaBjujM,125
- indexify-0.0.35.dist-info/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- indexify-0.0.35.dist-info/METADATA,sha256=5dk9KT6S-pNHOQwAgYDaxWv2XRyf_8NtaQQXUzpS5HE,1854
- indexify-0.0.35.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- indexify-0.0.35.dist-info/RECORD,,