beekeeper_ai-0.6.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- beekeeper/__init__.py +1 -0
- beekeeper/core/document/__init__.py +6 -0
- beekeeper/core/document/schema.py +97 -0
- beekeeper/core/document_loaders/__init__.py +5 -0
- beekeeper/core/document_loaders/base.py +24 -0
- beekeeper/core/embeddings/__init__.py +6 -0
- beekeeper/core/embeddings/base.py +44 -0
- beekeeper/core/text_splitters/utils.py +142 -0
- beekeeper/core/utils/pairwise.py +20 -0
- beekeeper/document_loaders/__init__.py +17 -0
- beekeeper/document_loaders/directory.py +65 -0
- beekeeper/document_loaders/docx.py +31 -0
- beekeeper/document_loaders/html.py +77 -0
- beekeeper/document_loaders/json.py +53 -0
- beekeeper/document_loaders/pdf.py +38 -0
- beekeeper/document_loaders/s3.py +72 -0
- beekeeper/document_loaders/watson_discovery.py +121 -0
- beekeeper/embeddings/__init__.py +7 -0
- beekeeper/embeddings/huggingface.py +66 -0
- beekeeper/embeddings/watsonx.py +100 -0
- beekeeper/evaluation/__init__.py +5 -0
- beekeeper/evaluation/knowledge_base_coverage.py +62 -0
- beekeeper/monitor/__init__.py +11 -0
- beekeeper/monitor/watsonx.py +843 -0
- beekeeper/retrievers/__init__.py +5 -0
- beekeeper/retrievers/watson_discovery.py +121 -0
- beekeeper/text_splitters/__init__.py +9 -0
- beekeeper/text_splitters/semantic.py +139 -0
- beekeeper/text_splitters/sentence.py +107 -0
- beekeeper/text_splitters/token.py +101 -0
- beekeeper/vector_stores/__init__.py +7 -0
- beekeeper/vector_stores/chroma.py +115 -0
- beekeeper/vector_stores/elasticsearch.py +183 -0
- beekeeper_ai-0.6.1.dist-info/LICENSE +7 -0
- beekeeper_ai-0.6.1.dist-info/METADATA +49 -0
- beekeeper_ai-0.6.1.dist-info/RECORD +37 -0
- beekeeper_ai-0.6.1.dist-info/WHEEL +4 -0
beekeeper/__init__.py
ADDED
@@ -0,0 +1 @@
__version__ = "0.6.1"
beekeeper/core/document/schema.py
ADDED
@@ -0,0 +1,97 @@
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, Optional

from pydantic.v1 import BaseModel, Field, validator

if TYPE_CHECKING:
    from langchain_core.documents import Document as LangChainDocument


class BaseDocument(ABC, BaseModel):
    """Generic abstract interface for retrievable documents."""

    doc_id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique ID of the document.")
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="A flat dictionary of metadata fields.")

    @validator("metadata", pre=True)
    def _validate_metadata(cls, v) -> Dict:
        if v is None:
            return {}
        return v

    @abstractmethod
    def get_content(self) -> str:
        """Get document content."""

    @abstractmethod
    def get_metadata(self) -> dict:
        """Get metadata."""


class Document(BaseDocument):
    """Generic interface for a data document."""

    text: str = Field(default="", description="Text content of the document.")

    @classmethod
    def class_name(cls) -> str:
        return "Document"

    def get_content(self) -> str:
        """Get the text content."""
        return self.text

    def get_metadata(self) -> dict:
        """Get metadata."""
        return self.metadata

    @classmethod
    def from_langchain_format(cls, doc: "LangChainDocument") -> "Document":
        """
        Convert a document from LangChain format.

        Args:
            doc (LangChainDocument): Document in LangChain format.
        """
        return cls(text=doc.page_content, metadata=doc.metadata)


@dataclass
class DocumentWithScore:
    document: BaseDocument
    score: Optional[float] = None

    @classmethod
    def class_name(cls) -> str:
        return "DocumentWithScore"

    def get_score(self) -> float:
        """Get score."""
        if self.score is None:
            return 0.0
        else:
            return self.score

    # #### pass through methods to BaseDocument ####
    @property
    def doc_id(self) -> str:
        return self.document.doc_id

    @property
    def text(self) -> str:
        if isinstance(self.document, Document):
            return self.document.text
        else:
            raise ValueError("Must be a Document to get text")

    def get_content(self) -> str:
        return self.document.get_content()

    def get_metadata(self) -> dict:
        return self.document.get_metadata()
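For orientation, a minimal usage sketch of the schema above (text and metadata values are illustrative; `Document` is also re-exported from `beekeeper.core.document`, while `DocumentWithScore` is imported from the module directly):

```python
from beekeeper.core.document import Document
from beekeeper.core.document.schema import DocumentWithScore

# doc_id is generated automatically; metadata defaults to an empty dict.
doc = Document(text="Bees communicate through the waggle dance.",
               metadata={"source": "hive-notes.txt"})

scored = DocumentWithScore(document=doc, score=0.87)
print(scored.get_content())  # text of the wrapped document
print(scored.get_score())    # 0.87
print(scored.doc_id)         # pass-through to the underlying document
```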
beekeeper/core/document_loaders/base.py
ADDED
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod
from typing import List, Optional

from pydantic.v1 import BaseModel

from beekeeper.core.document import Document


class BaseLoader(ABC, BaseModel):
    """An interface for document loader."""

    @classmethod
    def class_name(cls) -> str:
        return "BaseLoader"

    @abstractmethod
    def load_data(self, extra_info: Optional[dict] = None) -> List[Document]:
        """Loads data."""

    def load(self) -> List[Document]:
        return self.load_data()

    def lazy_load(self) -> List[Document]:
        return self.load_data()
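Concrete loaders subclass `BaseLoader` and implement `load_data`; a hypothetical minimal subclass (the class name and its `texts` field are made up for illustration):

```python
from typing import List, Optional

from beekeeper.core.document import Document
from beekeeper.core.document_loaders import BaseLoader


class InMemoryLoader(BaseLoader):
    """Hypothetical loader that wraps a list of raw strings."""

    texts: List[str] = []

    def load_data(self, extra_info: Optional[dict] = None) -> List[Document]:
        return [Document(text=t, metadata=extra_info or {}) for t in self.texts]


docs = InMemoryLoader(texts=["hello", "world"]).load()
```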
beekeeper/core/embeddings/base.py
ADDED
@@ -0,0 +1,44 @@
from abc import ABC, abstractmethod
from typing import List, Literal

import numpy as np

from beekeeper.core.utils.pairwise import cosine_similarity

Embedding = List[float]


class BaseEmbedding(ABC):
    """An interface for embedding models."""

    @classmethod
    def class_name(cls) -> str:
        return "BaseEmbedding"

    @abstractmethod
    def get_query_embedding(self, query: str) -> Embedding:
        """Get query embedding."""

    @abstractmethod
    def get_texts_embedding(self, texts: List[str]) -> List[Embedding]:
        """Get text embeddings."""

    @abstractmethod
    def get_documents_embedding(self, documents: List[str]) -> List[Embedding]:
        """Get documents embeddings."""

    def embed_documents(self, texts: List[str]) -> List[Embedding]:
        return self.get_texts_embedding(texts=texts)

    @staticmethod
    def similarity(embedding1: Embedding, embedding2: Embedding,
                   mode: Literal["cosine", "dot_product", "euclidean"] = "cosine"):
        """Get embedding similarity."""
        if mode == "euclidean":
            return -float(np.linalg.norm(np.array(embedding1) - np.array(embedding2)))

        elif mode == "dot_product":
            return np.dot(embedding1, embedding2)

        else:
            return cosine_similarity(embedding1, embedding2)
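The `similarity` helper is a static method, so it can be tried without a concrete embedding backend; the vectors below are toy values:

```python
from beekeeper.core.embeddings.base import BaseEmbedding

a = [1.0, 0.0, 1.0]
b = [0.5, 0.5, 0.5]

print(BaseEmbedding.similarity(a, b))                      # cosine similarity (default)
print(BaseEmbedding.similarity(a, b, mode="dot_product"))  # raw dot product
print(BaseEmbedding.similarity(a, b, mode="euclidean"))    # negated L2 distance
```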
beekeeper/core/text_splitters/utils.py
ADDED
@@ -0,0 +1,142 @@
from typing import Callable, List, Tuple


def tokenizer(text: str) -> List:
    try:
        import tiktoken
    except ImportError:
        raise ImportError("tiktoken package not found, please install it with `pip install tiktoken`")

    enc = tiktoken.get_encoding("cl100k_base")
    return enc.encode(text)


def split_by_sep(sep) -> Callable[[str], List[str]]:
    """Split text by separator."""
    return lambda text: text.split(sep)


def split_by_regex(regex: str) -> Callable[[str], List[str]]:
    """Split text by regex."""
    import re

    return lambda text: re.findall(regex, text)


def split_by_char() -> Callable[[str], List[str]]:
    """Split text by character."""
    return lambda text: list(text)


def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
    try:
        import nltk
    except ImportError:
        raise ImportError("nltk package not found, please install it with `pip install nltk`")

    sentence_tokenizer = nltk.tokenize.PunktSentenceTokenizer()
    return lambda text: _split_by_sentence_tokenizer(text, sentence_tokenizer)


def _split_by_sentence_tokenizer(text: str, sentence_tokenizer) -> List[str]:
    """Get the spans and then return the sentences.

    Using the start index of each span
    Instead of using end, use the start of the next span
    """
    spans = list(sentence_tokenizer.span_tokenize(text))
    sentences = []
    for i, span in enumerate(spans):
        start = span[0]
        if i < len(spans) - 1:
            end = spans[i + 1][0]
        else:
            end = len(text)
        sentences.append(text[start:end])
    return sentences


def split_by_fns(text: str,
                 split_fns: List[Callable],
                 sub_split_fns: List[Callable] = None) -> Tuple[List[str], bool]:
    """Split text by defined list of split functions."""
    if not split_fns:
        raise ValueError("Must provide a `split_fns` parameter")

    for split_fn in split_fns:
        splits = split_fn(text)
        if len(splits) > 1:
            return splits, True

    if sub_split_fns:  # noqa: RET503
        for split_fn in sub_split_fns:  # noqa: RET503
            splits = split_fn(text)
            if len(splits) > 1:
                return splits, False


def merge_splits(splits: List[dict],
                 chunk_size: int,
                 chunk_overlap: int) -> List[str]:
    """Merge splits into chunks."""
    chunks: List[str] = []
    cur_chunk: List[Tuple[str, int]] = []
    cur_chunk_len = 0
    last_chunk: List[Tuple[str, int]] = []
    new_chunk = True

    def close_chunk() -> None:
        nonlocal chunks, cur_chunk, last_chunk, cur_chunk_len, new_chunk

        chunks.append("".join([text for text, length in cur_chunk]))
        last_chunk = cur_chunk
        cur_chunk = []
        cur_chunk_len = 0
        new_chunk = True

        # add overlap to the next chunk using previous chunk
        if len(last_chunk) > 0:
            last_index = len(last_chunk) - 1
            while (
                last_index >= 0
                and cur_chunk_len + last_chunk[last_index][1] <= chunk_overlap):
                text, length = last_chunk[last_index]
                cur_chunk_len += length
                cur_chunk.insert(0, (text, length))
                last_index -= 1

    def postprocess_chunks(_chunks: List[str]) -> List[str]:
        """Post-process chunks."""
        post_chunks = []
        for _chunk in _chunks:
            stripped_chunk = _chunk.strip()
            if stripped_chunk == "":
                continue
            post_chunks.append(stripped_chunk)
        return post_chunks

    while len(splits) > 0:
        cur_split = splits[0]

        if cur_split["token_size"] > chunk_size:
            raise ValueError("Got a split size that exceeded chunk size")

        if cur_chunk_len + cur_split["token_size"] > chunk_size and not new_chunk:
            close_chunk()
        else:
            if (cur_split["is_sentence"]
                    or cur_chunk_len + cur_split["token_size"] <= chunk_size
                    or new_chunk):  # If `new_chunk`, always add at least one split

                cur_chunk_len += cur_split["token_size"]
                cur_chunk.append((cur_split["text"], cur_split["token_size"]))
                splits.pop(0)
                new_chunk = False
            else:
                close_chunk()

    if not new_chunk:
        chunk = "".join([text for text, length in cur_chunk])
        chunks.append(chunk)

    return postprocess_chunks(chunks)
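`merge_splits` consumes split dictionaries carrying `text`, `token_size`, and `is_sentence` keys; a rough sketch of wiring the helpers above together (assumes `tiktoken` is installed, and the sentence handling is deliberately simplified):

```python
from beekeeper.core.text_splitters.utils import merge_splits, split_by_sep, tokenizer

text = "Bees pollinate crops. Honey is stored in the hive. Workers guard the entrance."

# Build one split per sentence-ish fragment, measured in cl100k_base tokens.
splits = [
    {"text": s, "token_size": len(tokenizer(s)), "is_sentence": True}
    for s in split_by_sep(". ")(text)
]

chunks = merge_splits(splits, chunk_size=20, chunk_overlap=5)
```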
beekeeper/core/utils/pairwise.py
ADDED
@@ -0,0 +1,20 @@
from typing import List

import numpy as np

Matrix = List[float]

def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Cosine similarity between two equal-length embedding vectors."""
    X = np.array(X)
    Y = np.array(Y)

    if X.shape[0] != Y.shape[0]:
        raise ValueError(
            f"Number of rows in X and Y must be the same. X has shape {X.shape} "
            f"and Y has shape {Y.shape}."
        )
    product = np.dot(X, Y)
    norm = np.linalg.norm(X) * np.linalg.norm(Y)

    return product / norm
beekeeper/document_loaders/__init__.py
ADDED
@@ -0,0 +1,17 @@
from beekeeper.document_loaders.directory import DirectoryLoader
from beekeeper.document_loaders.docx import DocxLoader
from beekeeper.document_loaders.html import HTMLLoader
from beekeeper.document_loaders.json import JSONLoader
from beekeeper.document_loaders.pdf import PDFLoader
from beekeeper.document_loaders.s3 import S3Loader
from beekeeper.document_loaders.watson_discovery import WatsonDiscoveryLoader

__all__ = [
    "DirectoryLoader",
    "DocxLoader",
    "HTMLLoader",
    "JSONLoader",
    "PDFLoader",
    "S3Loader",
    "WatsonDiscoveryLoader",
]
beekeeper/document_loaders/directory.py
ADDED
@@ -0,0 +1,65 @@
import glob
import os
from pathlib import Path
from typing import List, Optional, Type

from beekeeper.core.document import Document
from beekeeper.core.document_loaders import BaseLoader


def _loading_default_supported_loaders():
    from beekeeper.document_loaders import DocxLoader, HTMLLoader, PDFLoader

    return {
        ".docx": DocxLoader,
        ".html": HTMLLoader,
        ".pdf": PDFLoader,
    }


class DirectoryLoader(BaseLoader):
    """Simple directory loader.

    Args:
        required_exts (List[str], optional): File extensions to load; only files with these extensions are read.
        recursive (bool, optional): Whether to recursively search subdirectories for files. Defaults to ``False``.
    """

    required_exts: List[str] = [".pdf", ".docx", ".html"]
    recursive: Optional[bool] = False
    file_loader: Optional[dict[str, Type[BaseLoader]]] = None

    def load_data(self, input_dir: str, extra_info: Optional[dict] = None) -> List[Document]:
        """Loads data from the specified directory.

        Args:
            input_dir (str): Directory path from which to load the documents.
        """
        if not os.path.isdir(input_dir):
            raise ValueError(f"`{input_dir}` is not a valid directory.")

        if self.file_loader is None:
            self.file_loader = _loading_default_supported_loaders()

        input_dir = Path(input_dir)
        documents = []

        pattern_prefix = "**/*" if self.recursive else "*"

        for extension in self.required_exts:
            files = glob.glob(os.path.join(input_dir, pattern_prefix + extension), recursive=self.recursive)

            for file_dir in files:
                loader_cls = self.file_loader.get(extension)
                if loader_cls:
                    try:
                        #TO-DO add `file_loader_kwargs`
                        doc = loader_cls().load_data(file_dir)
                        documents.extend(doc)
                    except Exception as e:
                        raise ValueError(f"Error reading {file_dir}: {e}") from e
                else:
                    #TO-DO add `unstructured file` support
                    raise ValueError(f"Unsupported file type: {extension}")

        return documents
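A usage sketch for `DirectoryLoader` (the directory path is a placeholder):

```python
from beekeeper.document_loaders import DirectoryLoader

loader = DirectoryLoader(required_exts=[".pdf", ".html"], recursive=True)
documents = loader.load_data(input_dir="./knowledge_base")
```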
beekeeper/document_loaders/docx.py
ADDED
@@ -0,0 +1,31 @@
import os
from pathlib import Path
from typing import List, Optional

from beekeeper.core.document import Document
from beekeeper.core.document_loaders import BaseLoader


class DocxLoader(BaseLoader):
    """Microsoft Word (Docx) loader."""

    def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
        """Loads data from the specified file.

        Args:
            input_file (str): File path to load.
        """
        try:
            import docx2txt  # noqa: F401
        except ImportError:
            raise ImportError("docx2txt package not found, please install it with `pip install docx2txt`")

        if not os.path.isfile(input_file):
            raise ValueError(f"File `{input_file}` does not exist")

        input_file = str(Path(input_file).resolve())

        text = docx2txt.process(input_file)
        metadata = {"source": input_file}

        return [Document(text=text, metadata=metadata)]
beekeeper/document_loaders/html.py
ADDED
@@ -0,0 +1,77 @@
import os
from pathlib import Path
from typing import List, Optional

from beekeeper.core.document import Document
from beekeeper.core.document_loaders import BaseLoader


class HTMLLoader(BaseLoader):
    """Load HTML file and extract text from a specific tag.

    Args:
        tag (str): HTML tag to extract. Defaults to ``section``.
    """

    tag: str = "section"

    def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
        """Loads data from the specified file.

        Args:
            input_file (str): File path to load.
        """
        try:
            from bs4 import BeautifulSoup  # noqa: F401
        except ImportError:
            raise ImportError("beautifulsoup4 package not found, please install it with `pip install beautifulsoup4`")

        if not os.path.isfile(input_file):
            raise ValueError(f"File `{input_file}` does not exist")

        input_file = str(Path(input_file).resolve())

        with open(input_file, encoding="utf-8") as html_file:
            soup = BeautifulSoup(html_file, "html.parser")

        tags = soup.find_all(self.tag)
        documents = []

        for tag in tags:
            tag_text = self._extract_text_from_tag(tag)

            metadata = {
                "tag": self.tag,
                "source": input_file,
            }

            doc = Document(
                text=tag_text,
                metadata=metadata,
            )

            documents.append(doc)

        return documents

    def _extract_text_from_tag(self, tag) -> str:
        """Extract the text from an HTML tag, ignoring other nested tags."""
        try:
            from bs4 import NavigableString  # noqa: F401
        except ImportError:
            raise ImportError("beautifulsoup4 package not found, please install it with `pip install beautifulsoup4`")

        texts = []

        for elem in tag.children:
            # Check if the element is a text node, not a tag
            if isinstance(elem, NavigableString):
                if elem.strip():
                    texts.append(elem.strip())
            # Ignore any tag that matches the main tag being processed (to avoid recursion)
            elif elem.name == self.tag:
                continue
            else:
                texts.append(elem.get_text().strip())

        return "\n".join(texts)
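For example, extracting every `<article>` block from a saved page might look like this (file name assumed):

```python
from beekeeper.document_loaders import HTMLLoader

docs = HTMLLoader(tag="article").load_data(input_file="saved_page.html")
for doc in docs:
    print(doc.get_metadata()["tag"], len(doc.get_content()))
```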
beekeeper/document_loaders/json.py
ADDED
@@ -0,0 +1,53 @@
import json
import os
from pathlib import Path
from typing import List, Optional

from beekeeper.core.document import Document
from beekeeper.core.document_loaders import BaseLoader


class JSONLoader(BaseLoader):
    """JSON loader.

    Args:
        jq_schema (str, optional): jq schema to use to extract the data from the JSON.
    """

    jq_schema: Optional[str] = None

    def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
        """Loads data from the specified file.

        Args:
            input_file (str): File path to load.
        """
        try:
            import jq  # noqa: F401
        except ImportError:
            raise ImportError("jq package not found, please install it with `pip install jq`")

        if not os.path.isfile(input_file):
            raise ValueError(f"File `{input_file}` does not exist")

        documents = []
        jq_compiler = jq.compile(self.jq_schema)
        json_file = Path(input_file).resolve().read_text(encoding="utf-8")
        json_data = jq_compiler.input(json.loads(json_file))

        for content in json_data:

            if isinstance(content, str):
                content = content
            elif isinstance(content, dict):
                content = json.dumps(content) if content else ""
            else:
                content = str(content) if content is not None else ""

            if content.strip() != "":
                documents.append(Document(
                    text=content,
                    metadata={"source": str(Path(input_file).resolve())}))

        return documents
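A sketch of extracting one document per array entry with a jq expression (the schema and file name are illustrative):

```python
from beekeeper.document_loaders import JSONLoader

# ".articles[].body" selects the body field of every entry in an "articles" array.
loader = JSONLoader(jq_schema=".articles[].body")
documents = loader.load_data(input_file="articles.json")
```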
beekeeper/document_loaders/pdf.py
ADDED
@@ -0,0 +1,38 @@
import logging
import os
from pathlib import Path
from typing import List, Optional

from beekeeper.core.document import Document
from beekeeper.core.document_loaders import BaseLoader

logging.getLogger("pypdf").setLevel(logging.ERROR)

class PDFLoader(BaseLoader):
    """PDF loader using PyPDF."""

    def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
        """Loads data from the specified file.

        Args:
            input_file (str): File path to load.
        """
        try:
            import pypdf  # noqa: F401

        except ImportError:
            raise ImportError("pypdf package not found, please install it with `pip install pypdf`")

        if not os.path.isfile(input_file):
            raise ValueError(f"File `{input_file}` does not exist")

        input_file = str(Path(input_file).resolve())
        pdf_loader = pypdf.PdfReader(input_file)

        return [
            Document(
                text=page.extract_text().strip(),
                metadata={"source": input_file, "page": page_number}
            )
            for page_number, page in enumerate(pdf_loader.pages)
        ]
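And a corresponding sketch for the PDF loader, which emits one `Document` per page (file name assumed):

```python
from beekeeper.document_loaders import PDFLoader

pages = PDFLoader().load_data(input_file="manual.pdf")
print(pages[0].get_metadata())  # {"source": ..., "page": 0}
```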