beekeeper-ai 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. beekeeper/__init__.py +1 -0
  2. beekeeper/core/document/__init__.py +6 -0
  3. beekeeper/core/document/schema.py +97 -0
  4. beekeeper/core/document_loaders/__init__.py +5 -0
  5. beekeeper/core/document_loaders/base.py +24 -0
  6. beekeeper/core/embeddings/__init__.py +6 -0
  7. beekeeper/core/embeddings/base.py +44 -0
  8. beekeeper/core/text_splitters/utils.py +142 -0
  9. beekeeper/core/utils/pairwise.py +20 -0
  10. beekeeper/document_loaders/__init__.py +17 -0
  11. beekeeper/document_loaders/directory.py +65 -0
  12. beekeeper/document_loaders/docx.py +31 -0
  13. beekeeper/document_loaders/html.py +77 -0
  14. beekeeper/document_loaders/json.py +53 -0
  15. beekeeper/document_loaders/pdf.py +38 -0
  16. beekeeper/document_loaders/s3.py +72 -0
  17. beekeeper/document_loaders/watson_discovery.py +121 -0
  18. beekeeper/embeddings/__init__.py +7 -0
  19. beekeeper/embeddings/huggingface.py +66 -0
  20. beekeeper/embeddings/watsonx.py +100 -0
  21. beekeeper/evaluation/__init__.py +5 -0
  22. beekeeper/evaluation/knowledge_base_coverage.py +62 -0
  23. beekeeper/monitor/__init__.py +11 -0
  24. beekeeper/monitor/watsonx.py +843 -0
  25. beekeeper/retrievers/__init__.py +5 -0
  26. beekeeper/retrievers/watson_discovery.py +121 -0
  27. beekeeper/text_splitters/__init__.py +9 -0
  28. beekeeper/text_splitters/semantic.py +139 -0
  29. beekeeper/text_splitters/sentence.py +107 -0
  30. beekeeper/text_splitters/token.py +101 -0
  31. beekeeper/vector_stores/__init__.py +7 -0
  32. beekeeper/vector_stores/chroma.py +115 -0
  33. beekeeper/vector_stores/elasticsearch.py +183 -0
  34. beekeeper_ai-0.6.1.dist-info/LICENSE +7 -0
  35. beekeeper_ai-0.6.1.dist-info/METADATA +49 -0
  36. beekeeper_ai-0.6.1.dist-info/RECORD +37 -0
  37. beekeeper_ai-0.6.1.dist-info/WHEEL +4 -0
beekeeper/__init__.py ADDED
@@ -0,0 +1 @@
+ __version__ = "0.6.1"
beekeeper/core/document/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from beekeeper.core.document.schema import Document, DocumentWithScore
+
+ __all__ = [
+     "Document",
+     "DocumentWithScore"
+ ]
beekeeper/core/document/schema.py ADDED
@@ -0,0 +1,97 @@
+ import uuid
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Any, Dict, Optional
+
+ from pydantic.v1 import BaseModel, Field, validator
+
+ if TYPE_CHECKING:
+     from langchain_core.documents import Document as LangChainDocument
+
+
+ class BaseDocument(ABC, BaseModel):
+     """Generic abstract interface for retrievable documents."""
+
+     doc_id: str = Field(
+         default_factory=lambda: str(uuid.uuid4()),
+         description="Unique ID of the document.")
+     metadata: Dict[str, Any] = Field(
+         default_factory=dict,
+         description="A flat dictionary of metadata fields.")
+
+     @validator("metadata", pre=True)
+     def _validate_metadata(cls, v) -> Dict:
+         if v is None:
+             return {}
+         return v
+
+     @abstractmethod
+     def get_content(self) -> str:
+         """Get document content."""
+
+     @abstractmethod
+     def get_metadata(self) -> dict:
+         """Get metadata."""
+
+
+ class Document(BaseDocument):
+     """Generic interface for a data document."""
+
+     text: str = Field(default="", description="Text content of the document.")
+
+     @classmethod
+     def class_name(cls) -> str:
+         return "Document"
+
+     def get_content(self) -> str:
+         """Get the text content."""
+         return self.text
+
+     def get_metadata(self) -> dict:
+         """Get metadata."""
+         return self.metadata
+
+     @classmethod
+     def from_langchain_format(cls, doc: "LangChainDocument") -> "Document":
+         """
+         Convert a document from LangChain format.
+
+         Args:
+             doc (LangChainDocument): Document in LangChain format.
+         """
+         return cls(text=doc.page_content, metadata=doc.metadata)
+
+
+ @dataclass
+ class DocumentWithScore:
+     document: BaseDocument
+     score: Optional[float] = None
+
+     @classmethod
+     def class_name(cls) -> str:
+         return "DocumentWithScore"
+
+     def get_score(self) -> float:
+         """Get score."""
+         if self.score is None:
+             return 0.0
+         else:
+             return self.score
+
+     # #### pass-through methods to BaseDocument ####
+     @property
+     def doc_id(self) -> str:
+         return self.document.doc_id
+
+     @property
+     def text(self) -> str:
+         if isinstance(self.document, Document):
+             return self.document.text
+         else:
+             raise ValueError("Must be a Document to get text")
+
+     def get_content(self) -> str:
+         return self.document.get_content()
+
+     def get_metadata(self) -> dict:
+         return self.document.get_metadata()
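A minimal usage sketch of the schema above (illustrative only; the text, score, and metadata values are made up):

    from beekeeper.core.document import Document, DocumentWithScore

    doc = Document(text="Bees communicate through the waggle dance.",
                   metadata={"source": "hive-notes.txt"})
    scored = DocumentWithScore(document=doc, score=0.87)

    scored.get_content()  # "Bees communicate through the waggle dance."
    scored.get_score()    # 0.87
    scored.doc_id         # passed through from the wrapped Document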
beekeeper/core/document_loaders/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from beekeeper.core.document_loaders.base import BaseLoader
+
+ __all__ = [
+     "BaseLoader",
+ ]
beekeeper/core/document_loaders/base.py ADDED
@@ -0,0 +1,24 @@
+ from abc import ABC, abstractmethod
+ from typing import List, Optional
+
+ from pydantic.v1 import BaseModel
+
+ from beekeeper.core.document import Document
+
+
+ class BaseLoader(ABC, BaseModel):
+     """An interface for document loaders."""
+
+     @classmethod
+     def class_name(cls) -> str:
+         return "BaseLoader"
+
+     @abstractmethod
+     def load_data(self, extra_info: Optional[dict] = None) -> List[Document]:
+         """Loads data."""
+
+     def load(self) -> List[Document]:
+         return self.load_data()
+
+     def lazy_load(self) -> List[Document]:
+         return self.load_data()
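A sketch of how a custom loader plugs into this interface (the InMemoryLoader class below is hypothetical, not part of the package):

    from typing import List, Optional

    from beekeeper.core.document import Document
    from beekeeper.core.document_loaders import BaseLoader


    class InMemoryLoader(BaseLoader):
        """Hypothetical loader that wraps a list of plain strings."""

        texts: List[str] = []

        def load_data(self, extra_info: Optional[dict] = None) -> List[Document]:
            # one Document per string; extra_info becomes the metadata
            return [Document(text=t, metadata=extra_info or {}) for t in self.texts]


    docs = InMemoryLoader(texts=["first note", "second note"]).load()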
beekeeper/core/embeddings/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from beekeeper.core.embeddings.base import BaseEmbedding, Embedding
+
+ __all__ = [
+     "BaseEmbedding",
+     "Embedding",
+ ]
beekeeper/core/embeddings/base.py ADDED
@@ -0,0 +1,44 @@
+ from abc import ABC, abstractmethod
+ from typing import List, Literal
+
+ import numpy as np
+
+ from beekeeper.core.utils.pairwise import cosine_similarity
+
+ Embedding = List[float]
+
+
+ class BaseEmbedding(ABC):
+     """An interface for embedding models."""
+
+     @classmethod
+     def class_name(cls) -> str:
+         return "BaseEmbedding"
+
+     @abstractmethod
+     def get_query_embedding(self, query: str) -> Embedding:
+         """Get query embedding."""
+
+     @abstractmethod
+     def get_texts_embedding(self, texts: List[str]) -> List[Embedding]:
+         """Get text embeddings."""
+
+     @abstractmethod
+     def get_documents_embedding(self, documents: List[str]) -> List[Embedding]:
+         """Get document embeddings."""
+
+     def embed_documents(self, texts: List[str]) -> List[Embedding]:
+         return self.get_texts_embedding(texts=texts)
+
+     @staticmethod
+     def similarity(embedding1: Embedding, embedding2: Embedding,
+                    mode: Literal["cosine", "dot_product", "euclidean"] = "cosine"):
+         """Get embedding similarity."""
+         if mode == "euclidean":
+             return -float(np.linalg.norm(np.array(embedding1) - np.array(embedding2)))
+
+         elif mode == "dot_product":
+             return np.dot(embedding1, embedding2)
+
+         else:
+             return cosine_similarity(embedding1, embedding2)
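Since similarity is a staticmethod, it can be sanity-checked with plain lists; a small sketch (values chosen for illustration):

    from beekeeper.core.embeddings import BaseEmbedding

    a, b = [1.0, 0.0], [0.7071, 0.7071]

    BaseEmbedding.similarity(a, b)                      # cosine (default), ~0.7071
    BaseEmbedding.similarity(a, b, mode="dot_product")  # ~0.7071
    BaseEmbedding.similarity(a, b, mode="euclidean")    # negated distance, ~-0.7654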
beekeeper/core/text_splitters/utils.py ADDED
@@ -0,0 +1,142 @@
+ from typing import Callable, List, Tuple
+
+
+ def tokenizer(text: str) -> List:
+     try:
+         import tiktoken
+     except ImportError:
+         raise ImportError("tiktoken package not found, please install it with `pip install tiktoken`")
+
+     enc = tiktoken.get_encoding("cl100k_base")
+     return enc.encode(text)
+
+
+ def split_by_sep(sep) -> Callable[[str], List[str]]:
+     """Split text by separator."""
+     return lambda text: text.split(sep)
+
+
+ def split_by_regex(regex: str) -> Callable[[str], List[str]]:
+     """Split text by regex."""
+     import re
+
+     return lambda text: re.findall(regex, text)
+
+
+ def split_by_char() -> Callable[[str], List[str]]:
+     """Split text by character."""
+     return lambda text: list(text)
+
+
+ def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
+     try:
+         import nltk
+     except ImportError:
+         raise ImportError("nltk package not found, please install it with `pip install nltk`")
+
+     sentence_tokenizer = nltk.tokenize.PunktSentenceTokenizer()
+     return lambda text: _split_by_sentence_tokenizer(text, sentence_tokenizer)
+
+
+ def _split_by_sentence_tokenizer(text: str, sentence_tokenizer) -> List[str]:
+     """Get the spans and then return the sentences.
+
+     Uses the start index of each span; instead of a span's own end,
+     the start of the next span marks the end of each sentence.
+     """
+     spans = list(sentence_tokenizer.span_tokenize(text))
+     sentences = []
+     for i, span in enumerate(spans):
+         start = span[0]
+         if i < len(spans) - 1:
+             end = spans[i + 1][0]
+         else:
+             end = len(text)
+         sentences.append(text[start:end])
+     return sentences
+
+
+ def split_by_fns(text: str,
+                  split_fns: List[Callable],
+                  sub_split_fns: List[Callable] = None) -> Tuple[List[str], bool]:
+     """Split text by a defined list of split functions."""
+     if not split_fns:
+         raise ValueError("Must provide a `split_fns` parameter")
+
+     for split_fn in split_fns:
+         splits = split_fn(text)
+         if len(splits) > 1:
+             return splits, True
+
+     if sub_split_fns:  # noqa: RET503
+         for split_fn in sub_split_fns:  # noqa: RET503
+             splits = split_fn(text)
+             if len(splits) > 1:
+                 return splits, False
+
+
+ def merge_splits(splits: List[dict],
+                  chunk_size: int,
+                  chunk_overlap: int) -> List[str]:
+     """Merge splits into chunks."""
+     chunks: List[str] = []
+     cur_chunk: List[Tuple[str, int]] = []
+     cur_chunk_len = 0
+     last_chunk: List[Tuple[str, int]] = []
+     new_chunk = True
+
+     def close_chunk() -> None:
+         nonlocal chunks, cur_chunk, last_chunk, cur_chunk_len, new_chunk
+
+         chunks.append("".join([text for text, length in cur_chunk]))
+         last_chunk = cur_chunk
+         cur_chunk = []
+         cur_chunk_len = 0
+         new_chunk = True
+
+         # add overlap to the next chunk using the previous chunk
+         if len(last_chunk) > 0:
+             last_index = len(last_chunk) - 1
+             while (
+                 last_index >= 0
+                 and cur_chunk_len + last_chunk[last_index][1] <= chunk_overlap):
+                 text, length = last_chunk[last_index]
+                 cur_chunk_len += length
+                 cur_chunk.insert(0, (text, length))
+                 last_index -= 1
+
+     def postprocess_chunks(_chunks: List[str]) -> List[str]:
+         """Post-process chunks."""
+         post_chunks = []
+         for _chunk in _chunks:
+             stripped_chunk = _chunk.strip()
+             if stripped_chunk == "":
+                 continue
+             post_chunks.append(stripped_chunk)
+         return post_chunks
+
+     while len(splits) > 0:
+         cur_split = splits[0]
+
+         if cur_split["token_size"] > chunk_size:
+             raise ValueError("Got a split size that exceeded chunk size")
+
+         if cur_chunk_len + cur_split["token_size"] > chunk_size and not new_chunk:
+             close_chunk()
+         else:
+             if (cur_split["is_sentence"]
+                     or cur_chunk_len + cur_split["token_size"] <= chunk_size
+                     or new_chunk):  # If `new_chunk`, always add at least one split
+
+                 cur_chunk_len += cur_split["token_size"]
+                 cur_chunk.append((cur_split["text"], cur_split["token_size"]))
+                 splits.pop(0)
+                 new_chunk = False
+             else:
+                 close_chunk()
+
+     if not new_chunk:
+         chunk = "".join([text for text, length in cur_chunk])
+         chunks.append(chunk)
+
+     return postprocess_chunks(chunks)
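A sketch of how these helpers compose into a chunking pass (assumes tiktoken is installed; the sample text and chunk sizes are made up):

    from beekeeper.core.text_splitters.utils import merge_splits, split_by_sep, tokenizer

    text = "Bees pollinate crops. Honey stores energy. Wax builds comb."
    pieces = split_by_sep(". ")(text)

    # merge_splits expects dicts with "text", "token_size", and "is_sentence" keys
    splits = [{"text": p, "token_size": len(tokenizer(p)), "is_sentence": True}
              for p in pieces]
    chunks = merge_splits(splits, chunk_size=20, chunk_overlap=5)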
beekeeper/core/utils/pairwise.py ADDED
@@ -0,0 +1,20 @@
+ from typing import List
+
+ import numpy as np
+
+ Matrix = List[float]
+
+ def cosine_similarity(X: Matrix, Y: Matrix) -> float:
+     """Cosine similarity between two equal-length vectors."""
+     X = np.array(X)
+     Y = np.array(Y)
+
+     if X.shape[0] != Y.shape[0]:
+         raise ValueError(
+             f"Number of elements in X and Y must be the same. X has shape {X.shape} "
+             f"and Y has shape {Y.shape}."
+         )
+     product = np.dot(X, Y)
+     norm = np.linalg.norm(X) * np.linalg.norm(Y)
+
+     return product / norm
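For two vectors this reduces to a single score, as used by BaseEmbedding.similarity above; for example:

    from beekeeper.core.utils.pairwise import cosine_similarity

    cosine_similarity([1.0, 0.0], [0.0, 1.0])  # 0.0  (orthogonal)
    cosine_similarity([1.0, 2.0], [2.0, 4.0])  # 1.0  (parallel)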
beekeeper/document_loaders/__init__.py ADDED
@@ -0,0 +1,17 @@
+ from beekeeper.document_loaders.directory import DirectoryLoader
+ from beekeeper.document_loaders.docx import DocxLoader
+ from beekeeper.document_loaders.html import HTMLLoader
+ from beekeeper.document_loaders.json import JSONLoader
+ from beekeeper.document_loaders.pdf import PDFLoader
+ from beekeeper.document_loaders.s3 import S3Loader
+ from beekeeper.document_loaders.watson_discovery import WatsonDiscoveryLoader
+
+ __all__ = [
+     "DirectoryLoader",
+     "DocxLoader",
+     "HTMLLoader",
+     "JSONLoader",
+     "PDFLoader",
+     "S3Loader",
+     "WatsonDiscoveryLoader",
+ ]
beekeeper/document_loaders/directory.py ADDED
@@ -0,0 +1,65 @@
+ import glob
+ import os
+ from pathlib import Path
+ from typing import List, Optional, Type
+
+ from beekeeper.core.document import Document
+ from beekeeper.core.document_loaders import BaseLoader
+
+
+ def _loading_default_supported_loaders():
+     from beekeeper.document_loaders import DocxLoader, HTMLLoader, PDFLoader
+
+     return {
+         ".docx": DocxLoader,
+         ".html": HTMLLoader,
+         ".pdf": PDFLoader,
+     }
+
+
+ class DirectoryLoader(BaseLoader):
+     """Simple directory loader.
+
+     Args:
+         required_exts (List[str], optional): List of file extensions; only files with these extensions are loaded.
+         recursive (bool, optional): Whether to recursively search for files. Defaults to ``False``.
+     """
+
+     required_exts: List[str] = [".pdf", ".docx", ".html"]
+     recursive: Optional[bool] = False
+     file_loader: Optional[dict[str, Type[BaseLoader]]] = None
+
+     def load_data(self, input_dir: str, extra_info: Optional[dict] = None) -> List[Document]:
+         """Loads data from the specified directory.
+
+         Args:
+             input_dir (str): Directory path from which to load the documents.
+         """
+         if not os.path.isdir(input_dir):
+             raise ValueError(f"`{input_dir}` is not a valid directory.")
+
+         if self.file_loader is None:
+             self.file_loader = _loading_default_supported_loaders()
+
+         input_dir = Path(input_dir)
+         documents = []
+
+         pattern_prefix = "**/*" if self.recursive else ""
+
+         for extension in self.required_exts:
+             files = glob.glob(os.path.join(input_dir, pattern_prefix + extension), recursive=self.recursive)
+
+             for file_dir in files:
+                 loader_cls = self.file_loader.get(extension)
+                 if loader_cls:
+                     try:
+                         # TO-DO: add `file_loader_kwargs`
+                         doc = loader_cls().load_data(file_dir)
+                         documents.extend(doc)
+                     except Exception as e:
+                         raise RuntimeError(f"Error reading {file_dir}: {e}") from e
+                 else:
+                     # TO-DO: add `unstructured file` support
+                     raise ValueError(f"Unsupported file type: {extension}")
+
+         return documents
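A usage sketch (the directory path is hypothetical; the matching per-format dependencies, e.g. pypdf and beautifulsoup4, must be installed):

    from beekeeper.document_loaders import DirectoryLoader

    loader = DirectoryLoader(required_exts=[".pdf", ".html"], recursive=True)
    documents = loader.load_data(input_dir="./knowledge_base")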
beekeeper/document_loaders/docx.py ADDED
@@ -0,0 +1,31 @@
+ import os
+ from pathlib import Path
+ from typing import List, Optional
+
+ from beekeeper.core.document import Document
+ from beekeeper.core.document_loaders import BaseLoader
+
+
+ class DocxLoader(BaseLoader):
+     """Microsoft Word (Docx) loader."""
+
+     def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
+         """Loads data from the specified file.
+
+         Args:
+             input_file (str): File path to load.
+         """
+         try:
+             import docx2txt  # noqa: F401
+         except ImportError:
+             raise ImportError("docx2txt package not found, please install it with `pip install docx2txt`")
+
+         if not os.path.isfile(input_file):
+             raise ValueError(f"File `{input_file}` does not exist")
+
+         input_file = str(Path(input_file).resolve())
+
+         text = docx2txt.process(input_file)
+         metadata = {"source": input_file}
+
+         return [Document(text=text, metadata=metadata)]
beekeeper/document_loaders/html.py ADDED
@@ -0,0 +1,77 @@
+ import os
+ from pathlib import Path
+ from typing import List, Optional
+
+ from beekeeper.core.document import Document
+ from beekeeper.core.document_loaders import BaseLoader
+
+
+ class HTMLLoader(BaseLoader):
+     """Load an HTML file and extract text from a specific tag.
+
+     Args:
+         tag (str): HTML tag to extract. Defaults to ``section``.
+     """
+
+     tag: str = "section"
+
+     def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
+         """Loads data from the specified file.
+
+         Args:
+             input_file (str): File path to load.
+         """
+         try:
+             from bs4 import BeautifulSoup  # noqa: F401
+         except ImportError:
+             raise ImportError("beautifulsoup4 package not found, please install it with `pip install beautifulsoup4`")
+
+         if not os.path.isfile(input_file):
+             raise ValueError(f"File `{input_file}` does not exist")
+
+         input_file = str(Path(input_file).resolve())
+
+         with open(input_file, encoding="utf-8") as html_file:
+             soup = BeautifulSoup(html_file, "html.parser")
+
+         tags = soup.find_all(self.tag)
+         documents = []
+
+         for tag in tags:
+             tag_text = self._extract_text_from_tag(tag)
+
+             metadata = {
+                 "tag": self.tag,
+                 "source": input_file,
+             }
+
+             doc = Document(
+                 text=tag_text,
+                 metadata=metadata,
+             )
+
+             documents.append(doc)
+
+         return documents
+
+     def _extract_text_from_tag(self, tag) -> str:
+         """Extract the text from an HTML tag, ignoring other nested tags."""
+         try:
+             from bs4 import NavigableString  # noqa: F401
+         except ImportError:
+             raise ImportError("beautifulsoup4 package not found, please install it with `pip install beautifulsoup4`")
+
+         texts = []
+
+         for elem in tag.children:
+             # Check if the element is a text node, not a tag
+             if isinstance(elem, NavigableString):
+                 if elem.strip():
+                     texts.append(elem.strip())
+             # Ignore any tag that matches the main tag being processed (to avoid recursion)
+             elif elem.name == self.tag:
+                 continue
+             else:
+                 texts.append(elem.get_text().strip())
+
+         return "\n".join(texts)
beekeeper/document_loaders/json.py ADDED
@@ -0,0 +1,53 @@
+ import json
+ import os
+ from pathlib import Path
+ from typing import List, Optional
+
+ from beekeeper.core.document import Document
+ from beekeeper.core.document_loaders import BaseLoader
+
+
+ class JSONLoader(BaseLoader):
+     """JSON loader.
+
+     Args:
+         jq_schema (str, optional): jq schema to use to extract the data from the JSON.
+     """
+
+     jq_schema: Optional[str] = None
+
+     def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
+         """Loads data from the specified file.
+
+         Args:
+             input_file (str): File path to load.
+         """
+         try:
+             import jq  # noqa: F401
+         except ImportError:
+             raise ImportError("jq package not found, please install it with `pip install jq`")
+
+         if not os.path.isfile(input_file):
+             raise ValueError(f"File `{input_file}` does not exist")
+
+         documents = []
+         jq_compiler = jq.compile(self.jq_schema)
+         json_file = Path(input_file).resolve().read_text(encoding="utf-8")
+         json_data = jq_compiler.input(json.loads(json_file))
+
+
+         for content in json_data:
+
+             if isinstance(content, str):
+                 content = content
+             elif isinstance(content, dict):
+                 content = json.dumps(content) if content else ""
+             else:
+                 content = str(content) if content is not None else ""
+
+             if content.strip() != "":
+                 documents.append(Document(
+                     text=content,
+                     metadata={"source": str(Path(input_file).resolve())}))
+
+         return documents
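A usage sketch (the file path and jq expression are hypothetical; requires the jq package):

    from beekeeper.document_loaders import JSONLoader

    # one Document per element selected by the jq expression
    loader = JSONLoader(jq_schema=".articles[].body")
    documents = loader.load_data("./data/articles.json")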
beekeeper/document_loaders/pdf.py ADDED
@@ -0,0 +1,38 @@
+ import logging
+ import os
+ from pathlib import Path
+ from typing import List, Optional
+
+ from beekeeper.core.document import Document
+ from beekeeper.core.document_loaders import BaseLoader
+
+ logging.getLogger("pypdf").setLevel(logging.ERROR)
+
+ class PDFLoader(BaseLoader):
+     """PDF loader using PyPDF."""
+
+     def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
+         """Loads data from the specified file.
+
+         Args:
+             input_file (str): File path to load.
+         """
+         try:
+             import pypdf  # noqa: F401
+
+         except ImportError:
+             raise ImportError("pypdf package not found, please install it with `pip install pypdf`")
+
+         if not os.path.isfile(input_file):
+             raise ValueError(f"File `{input_file}` does not exist")
+
+         input_file = str(Path(input_file).resolve())
+         pdf_loader = pypdf.PdfReader(input_file)
+
+         return [
+             Document(
+                 text=page.extract_text().strip(),
+                 metadata={"source": input_file, "page": page_number}
+             )
+             for page_number, page in enumerate(pdf_loader.pages)
+         ]
+ ]