unstructured-ingest 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/weaviate/test_local.py +27 -6
- test/integration/embedders/test_azure_openai.py +1 -3
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -3
- test/integration/embedders/test_mixedbread.py +2 -2
- test/integration/embedders/test_octoai.py +2 -4
- test/integration/embedders/test_openai.py +2 -4
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +2 -4
- test/integration/embedders/test_voyageai.py +2 -4
- test/integration/embedders/utils.py +12 -14
- test/unit/embed/test_openai.py +12 -4
- test/unit/test_html.py +112 -0
- test/unit/v2/embedders/test_voyageai.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/huggingface.py +6 -1
- unstructured_ingest/embed/interfaces.py +9 -6
- unstructured_ingest/embed/mixedbreadai.py +3 -10
- unstructured_ingest/embed/octoai.py +14 -7
- unstructured_ingest/embed/openai.py +18 -5
- unstructured_ingest/embed/togetherai.py +19 -8
- unstructured_ingest/embed/vertexai.py +13 -6
- unstructured_ingest/embed/voyageai.py +19 -6
- unstructured_ingest/utils/html.py +143 -93
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/indexer.py +2 -3
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/interfaces/uploader.py +14 -1
- unstructured_ingest/v2/pipeline/pipeline.py +20 -6
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +15 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +5 -29
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +41 -3
- unstructured_ingest/v2/processes/embedder.py +3 -0
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/METADATA +9 -9
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/RECORD +40 -38
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/embed/togetherai.py

@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     RateLimitError as CustomRateLimitError,
@@ -71,13 +72,18 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):

     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_client()
+        embeddings = []
         try:
-            outputs = client.embeddings.create(
-                model=self.config.embedder_model_name, input=elements
-            )
+            for batch in batch_generator(
+                elements, batch_size=self.config.batch_size or len(elements)
+            ):
+                outputs = client.embeddings.create(
+                    model=self.config.embedder_model_name, input=batch
+                )
+                embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [outputs.data[i].embedding for i in range(len(elements))]
+        return embeddings


 @dataclass
@@ -97,10 +103,15 @@ class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):

     async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_async_client()
+        embeddings = []
         try:
-            outputs = await client.embeddings.create(
-                model=self.config.embedder_model_name, input=elements
-            )
+            for batch in batch_generator(
+                elements, batch_size=self.config.batch_size or len(elements)
+            ):
+                outputs = await client.embeddings.create(
+                    model=self.config.embedder_model_name, input=batch
+                )
+                embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [outputs.data[i].embedding for i in range(len(elements))]
+        return embeddings
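Every embedder touched in this release gets the same treatment: requests now flow through batch_generator from unstructured_ingest/utils/data_prep.py, with self.config.batch_size or len(elements) falling back to a single full-size batch when no batch size is configured. A minimal sketch of what such a generator does, assuming plain fixed-size slicing (the shipped implementation may differ):

from typing import Generator, Sequence, TypeVar

T = TypeVar("T")


def batch_generator(items: Sequence[T], batch_size: int) -> Generator[list[T], None, None]:
    # Yield successive batch_size-sized slices; the final batch may be smaller.
    for start in range(0, len(items), batch_size):
        yield list(items[start : start + batch_size])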
unstructured_ingest/embed/vertexai.py

@@ -13,6 +13,7 @@ from unstructured_ingest.embed.interfaces import (
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import UserAuthError

@@ -86,12 +87,15 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
         from vertexai.language_models import TextEmbeddingInput

         inputs = [TextEmbeddingInput(text=element) for element in elements]
+        client = self.config.get_client()
+        embeddings = []
         try:
-            client = self.config.get_client()
-            embeddings = client.get_embeddings(inputs)
+            for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
+                response = client.get_embeddings(batch)
+                embeddings.extend([e.values for e in response])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [e.values for e in embeddings]
+        return embeddings


 @dataclass
@@ -118,9 +122,12 @@ class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         from vertexai.language_models import TextEmbeddingInput

         inputs = [TextEmbeddingInput(text=element) for element in elements]
+        client = self.config.get_client()
+        embeddings = []
         try:
-            client = self.config.get_client()
-            embeddings = await client.get_embeddings_async(inputs)
+            for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
+                response = await client.get_embeddings_async(batch)
+                embeddings.extend([e.values for e in response])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [e.values for e in embeddings]
+        return embeddings
unstructured_ingest/embed/voyageai.py

@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     ProviderError,
@@ -25,9 +26,13 @@ if TYPE_CHECKING:


 class VoyageAIEmbeddingConfig(EmbeddingConfig):
+    batch_size: int = Field(
+        default=32,
+        le=128,
+        description="Batch size for embedding requests. VoyageAI has a limit of 128.",
+    )
     api_key: SecretStr
     embedder_model_name: str = Field(default="voyage-3", alias="model_name")
-    batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
     max_retries: int = 0
     timeout_in_seconds: Optional[int] = None
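The new batch_size field relies on pydantic's le constraint, so a configured value above VoyageAI's documented limit of 128 fails at validation time instead of at request time. A standalone illustration of that behavior (not a test from the package):

from pydantic import BaseModel, Field, ValidationError


class Config(BaseModel):
    batch_size: int = Field(default=32, le=128)


Config()                 # batch_size defaults to 32
Config(batch_size=128)   # at the limit, accepted
try:
    Config(batch_size=256)
except ValidationError as err:
    print(err)  # "Input should be less than or equal to 128"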
@@ -91,12 +96,15 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
             return self.config.wrap_error(e=e)

     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        client = self.config.get_client()
+        client = self.config.get_client()
+        embeddings = []
         try:
-            response = client.embed(texts=elements, model=self.config.embedder_model_name)
+            for batch in batch_generator(elements, batch_size=self.config.batch_size):
+                response = client.embed(texts=batch, model=self.config.embedder_model_name)
+                embeddings.extend(response.embeddings)
         except Exception as e:
             raise self.wrap_error(e=e)
-        return response.embeddings
+        return embeddings

     def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = self._embed_documents([e.get("text", "") for e in elements])
@@ -115,11 +123,16 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):

     async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_async_client()
+        embeddings = []
         try:
-            response = await client.embed(texts=elements, model=self.config.embedder_model_name)
+            for batch in batch_generator(
+                elements, batch_size=self.config.batch_size or len(elements)
+            ):
+                response = await client.embed(texts=batch, model=self.config.embedder_model_name)
+                embeddings.extend(response.embeddings)
         except Exception as e:
             raise self.wrap_error(e=e)
-        return response.embeddings
+        return embeddings

     async def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = await self._embed_documents([e.get("text", "") for e in elements])
unstructured_ingest/utils/html.py

@@ -1,109 +1,159 @@
 import base64
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 from urllib.parse import urlparse
 from uuid import NAMESPACE_DNS, uuid5

-import requests
-from bs4 import BeautifulSoup
-from requests import Session
+from pydantic import BaseModel, Field

+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger

+if TYPE_CHECKING:
+    from bs4.element import Tag
+    from requests import Session

-… [old lines 15-26, the start of the removed convert_image_tags helper, are not recoverable from this view]
+
+class HtmlMixin(BaseModel):
+    extract_images: bool = Field(
+        default=False,
+        description="if true, will download images and replace "
+        "the html content with base64 encoded images",
+    )
+    extract_files: bool = Field(
+        default=False, description="if true, will download any embedded files"
+    )
+    force_download: bool = Field(
+        default=False,
+        description="if true, will redownload extracted files even if they already exist locally",
+    )
+    allow_list: Optional[list[str]] = Field(
+        default=None,
+        description="list of allowed urls to download, if not set, "
+        "will default to the base url the original HTML came from",
+    )
+
+    @requires_dependencies(["requests"])
+    def get_default_session(self) -> "Session":
+        import requests
+
+        return requests.Session()
+
+    def get_absolute_url(self, tag_link: str, url: str) -> str:
+        parsed_url = urlparse(url)
+        base_url = parsed_url.scheme + "://" + parsed_url.netloc
+        if tag_link.startswith("//"):
+            return f"{parsed_url.scheme}:{tag_link}"
+        elif tag_link.startswith("http"):
+            return tag_link
         else:
-… [old lines 28-32 are not recoverable from this view]
-            logger.debug(
-                "img tag having src updated from {} to base64 content".format(image["src"])
-            )
-            image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
-        except Exception as e:
-            logger.warning(
-                f"failed to download image content from {source_url}: {e}", exc_info=True
-            )
-    return str(soup)
-
-
-def download_link(
-    download_dir: Path, link: str, session: Optional[Session] = None, force_download: bool = False
-) -> Path:
-    session = session or requests.Session()
-    filename = Path(urlparse(url=link).path).name
-    download_path = download_dir / filename
-    logger.debug(f"downloading file from {link} to {download_path}")
-    if download_path.exists() and download_path.is_file() and not force_download:
-        return download_path
-    with download_path.open("wb") as downloaded_file:
-        response = session.get(link)
+            tag_link = tag_link.lstrip("/")
+            return f"{base_url}/{tag_link}"
+
+    def download_content(self, url: str, session: "Session") -> bytes:
+        response = session.get(url)
         response.raise_for_status()
-… [old lines 56-101, the remainder of the removed module-level helpers, are not recoverable from this view]
+        return response.content
+
+    def can_download(self, url_to_download: str, original_url: str) -> bool:
+        parsed_original_url = urlparse(original_url)
+        base_url = parsed_original_url.scheme + "://" + parsed_original_url.netloc
+        allow_list = self.allow_list or [base_url]
+        for allowed_url in allow_list:
+            if url_to_download.startswith(allowed_url):
+                return True
+        logger.info(f"Skipping url because it does not match the allow list: {url_to_download}")
+        return False
+
+    def extract_image_src(self, image: "Tag", url: str, session: "Session") -> "Tag":
+        current_src = image["src"]
+        if current_src.startswith("data:image/png;base64"):
+            # already base64 encoded
+            return image
+        absolute_url = self.get_absolute_url(tag_link=image["src"], url=url)
+        if not self.can_download(url_to_download=absolute_url, original_url=url):
+            return image
+        image_content = self.download_content(url=absolute_url, session=session)
+        logger.debug("img tag having src updated from {} to base64 content".format(image["src"]))
+        image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+        return image
+
+    @requires_dependencies(["bs4"])
+    def extract_html_images(self, url: str, html: str, session: Optional["Session"] = None) -> str:
+        from bs4 import BeautifulSoup
+
+        session = session or self.get_default_session()
+        soup = BeautifulSoup(html, "html.parser")
+        images = soup.find_all("img")
+        for image in images:
+            self.extract_image_src(image=image, url=url, session=session)
+        return str(soup)
+
+    @requires_dependencies(["bs4"])
+    def get_hrefs(self, url: str, html: str) -> list:
+        from bs4 import BeautifulSoup
+
+        soup = BeautifulSoup(html, "html.parser")
+        tags = soup.find_all("a", href=True)
+        hrefs = [
+            tag["href"]
+            for tag in tags
+            if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+        ]
+        absolute_urls = [self.get_absolute_url(tag_link=href, url=url) for href in hrefs]
+        allowed_urls = [
+            url_to_download
+            for url_to_download in absolute_urls
+            if self.can_download(url_to_download=url_to_download, original_url=url)
+        ]
+        return allowed_urls
+
+    def write_content(self, content: bytes, path: Path) -> None:
+        if path.exists() and path.is_file() and not self.force_download:
+            return
+        if not path.parent.exists():
+            path.parent.mkdir(parents=True)
+        with path.open("wb") as f:
+            f.write(content)
+
+    def get_download_response(
+        self, url: str, download_dir: Path, file_data: FileData, session: "Session"
+    ) -> DownloadResponse:
+        filename = Path(urlparse(url=url).path).name
+        download_path = download_dir / filename
+        self.write_content(
+            content=self.download_content(url=url, session=session), path=download_path
         )
-… [old line 103 is not recoverable from this view]
+        result_file_data = file_data.model_copy(deep=True)
+        result_file_data.metadata.url = url
+        if result_file_data.metadata.record_locator is None:
+            result_file_data.metadata.record_locator = {}
+        result_file_data.metadata.record_locator["parent_url"] = url
+        result_file_data.identifier = str(uuid5(NAMESPACE_DNS, url + file_data.identifier))
+        filename = Path(urlparse(url=url).path).name
         result_file_data.source_identifiers = SourceIdentifiers(
             filename=filename, fullpath=filename
         )
-        result_file_data.local_download_path = … [truncated in this view]
-… [old lines 108-109 are not recoverable from this view]
+        result_file_data.local_download_path = download_path.as_posix()
+        return DownloadResponse(file_data=result_file_data, path=download_path)
+
+    def extract_embedded_files(
+        self,
+        url: str,
+        html: str,
+        download_dir: Path,
+        original_filedata: FileData,
+        session: Optional["Session"] = None,
+    ) -> list[DownloadResponse]:
+        session = session or self.get_default_session()
+        urls_to_download = self.get_hrefs(url=url, html=html)
+        return [
+            self.get_download_response(
+                url=url_to_download,
+                download_dir=download_dir,
+                file_data=original_filedata,
+                session=session,
+            )
+            for url_to_download in urls_to_download
+        ]
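The rewrite replaces the old module-level helpers with a reusable HtmlMixin that connector download configs can inherit. A hedged usage sketch (the URL and HTML are illustrative; extract_html_images performs real HTTP downloads through requests):

from unstructured_ingest.utils.html import HtmlMixin

mixin = HtmlMixin(extract_images=True, extract_files=True)
html = '<body><img src="/logo.png"/><a href="/docs/guide.pdf">guide</a></body>'

# Resolve embedded links, keeping only urls permitted by allow_list
# (which defaults to the page's own host):
print(mixin.get_hrefs(url="https://example.com/page", html=html))
# ['https://example.com/docs/guide.pdf']

# Inline every <img> as a base64 data URI (downloads each image):
inlined = mixin.extract_html_images(url="https://example.com/page", html=html)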
unstructured_ingest/v2/interfaces/__init__.py

@@ -5,7 +5,7 @@ from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
 from .upload_stager import UploadStager, UploadStagerConfig
-from .uploader import UploadContent, Uploader, UploaderConfig
+from .uploader import UploadContent, Uploader, UploaderConfig, VectorDBUploader

 __all__ = [
     "DownloadResponse",

@@ -29,4 +29,5 @@ __all__ = [
     "FileDataSourceMetadata",
     "BatchFileData",
     "BatchItem",
+    "VectorDBUploader",
 ]
unstructured_ingest/v2/interfaces/indexer.py

@@ -1,4 +1,4 @@
-from abc import ABC, abstractmethod
+from abc import ABC
 from typing import Any, AsyncGenerator, Generator, Optional, TypeVar

 from pydantic import BaseModel

@@ -22,9 +22,8 @@ class Indexer(BaseProcess, BaseConnector, ABC):
     def is_async(self) -> bool:
         return False

-    @abstractmethod
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-… [old line 27 is not recoverable from this view]
+        raise NotImplementedError()

     async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
         raise NotImplementedError()
unstructured_ingest/v2/interfaces/uploader.py

@@ -1,7 +1,7 @@
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, TypeVar
+from typing import Any, Optional, TypeVar

 from pydantic import BaseModel

@@ -38,6 +38,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
     def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
         raise NotImplementedError()

+    def create_destination(self, destination_name: str = "elements", **kwargs: Any) -> bool:
+        # Update the uploader config if needed with a new destination that gets created.
+        # Return a flag on if anything was created or not.
+        return False
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         data = get_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)

@@ -51,3 +56,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):

     async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         return self.run_data(data=data, file_data=file_data, **kwargs)
+
+
+@dataclass
+class VectorDBUploader(Uploader, ABC):
+    def create_destination(
+        self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
+    ) -> bool:
+        return False
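VectorDBUploader extends the new create_destination hook with a vector_length parameter so vector stores can provision a correctly-dimensioned collection before upload (presumably what the weaviate.py changes in this release hook into). A sketch of a hypothetical destination connector; the client attribute and its methods are invented for illustration:

from dataclasses import dataclass
from typing import Any, Optional

from unstructured_ingest.v2.interfaces import VectorDBUploader


@dataclass
class MyVectorStoreUploader(VectorDBUploader):
    def create_destination(
        self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
    ) -> bool:
        # Create the collection if it is missing and report whether
        # anything was actually created.
        if self.client.has_collection(destination_name):
            return False
        self.client.create_collection(name=destination_name, dim=vector_length)
        return True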
unstructured_ingest/v2/pipeline/pipeline.py

@@ -11,6 +11,7 @@ from typing import Any
 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
 from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.otel import OtelHandler
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
@@ -91,10 +92,6 @@ class Pipeline:
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None

         self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None
-        # TODO: support initialize() call from each step process
-        # Potential long call to download embedder models, run before any fanout:
-        if embedder and embedder.config:
-            embedder.config.get_embedder().initialize()

         self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
         self.uploader_step = UploadStep(process=uploader, context=self.context)
@@ -135,6 +132,7 @@ class Pipeline:
         with otel_handler.get_tracer().start_as_current_span(
             "ingest process", record_exception=True
         ):
+            self._run_inits()
             self._run_prechecks()
             self._run()
         finally:
@@ -156,7 +154,7 @@ class Pipeline:
         final = [f for f in flat if f]
         return final or None

-    def _run_prechecks(self):
+    def _get_all_steps(self) -> list[PipelineStep]:
         steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
         if self.chunker_step:
             steps.append(self.chunker_step)
@@ -166,8 +164,24 @@ class Pipeline:
             steps.append(self.uncompress_step)
         if self.stager_step:
             steps.append(self.stager_step)
+        return steps
+
+    def _run_inits(self):
+        failures = {}
+
+        for step in self._get_all_steps():
+            try:
+                step.process.init()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step init failure: {k}: {v}")
+            raise PipelineError("Init failed")
+
+    def _run_prechecks(self):
         failures = {}
-        for step in steps:
+        for step in self._get_all_steps():
             try:
                 step.process.precheck()
             except Exception as e:
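The new _run_inits pass mirrors _run_prechecks: each step's process gets an init() call up front, failures are collected per process class, and the run aborts before any fanout. This generalizes the removed constructor special-case that eagerly initialized only the embedder. The matching process.py change (+3 -0, not expanded in this view) presumably adds a no-op default; a plausible sketch, not the released code:

from typing import Any


class BaseProcess:  # stand-in for unstructured_ingest.v2.interfaces.process.BaseProcess
    def init(self, **kwargs: Any) -> None:
        # Assumed default no-op; heavyweight processes (e.g. an embedder that
        # must download models) override this so setup runs once, before fanout.
        pass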
unstructured_ingest/v2/processes/connectors/assets/__init__.py

File without changes.
unstructured_ingest/v2/processes/connectors/confluence.py

@@ -7,6 +7,7 @@ from pydantic import Field, Secret

 from unstructured_ingest.error import SourceConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.html import HtmlMixin
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -180,19 +181,8 @@ class ConfluenceIndexer(Indexer):
         yield file_data


-class ConfluenceDownloaderConfig(DownloaderConfig):
-    extract_images: bool = Field(
-        default=False,
-        description="if true, will download images and replace "
-        "the html content with base64 encoded images",
-    )
-    extract_files: bool = Field(
-        default=False, description="if true, will download any embedded files"
-    )
-    force_download: bool = Field(
-        default=False,
-        description="if true, will redownload extracted files even if they already exist locally",
-    )
+class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
+    pass


 @dataclass
@@ -206,24 +196,27 @@ class ConfluenceDownloader(Downloader):
     ) -> list[DownloadResponse]:
         if not self.download_config.extract_files:
             return []
-… [old lines 209-210 are not recoverable from this view]
+        url = current_file_data.metadata.url
+        if url is None:
+            logger.warning(
+                f"""Missing URL for file: {current_file_data.source_identifiers.filename}.
+                Skipping file extraction."""
+            )
+            return []
         filepath = current_file_data.source_identifiers.relative_path
         download_path = Path(self.download_dir) / filepath
         download_dir = download_path.with_suffix("")
-        return … [truncated in this view]
+        return self.download_config.extract_embedded_files(
+            url=url,
             download_dir=download_dir,
             original_filedata=current_file_data,
-… [old line 217 is not recoverable from this view]
+            html=html,
             session=session,
-            force_download=self.download_config.force_download,
         )

     def run(self, file_data: FileData, **kwargs) -> download_responses:
         from bs4 import BeautifulSoup

-        from unstructured_ingest.utils.html import convert_image_tags
-
         doc_id = file_data.identifier
         try:
             with self.connection_config.get_client() as client:
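Since ConfluenceDownloaderConfig now simply mixes in HtmlMixin, the previous fields keep their names and defaults, and the mixin's allow_list option becomes available to the connector for free. Illustrative values:

from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceDownloaderConfig

config = ConfluenceDownloaderConfig(
    extract_images=True,
    extract_files=True,
    allow_list=["https://mycompany.atlassian.net"],  # hypothetical Confluence host
)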
@@ -246,8 +239,8 @@ class ConfluenceDownloader(Downloader):
             content = f"<body class='Document' >{title_html}{content}</body>"
             if self.download_config.extract_images:
                 with self.connection_config.get_client() as client:
-                    content = convert_image_tags(
-                        url=file_data.metadata.url,
+                    content = self.download_config.extract_html_images(
+                        url=file_data.metadata.url, html=content, session=client._session
                     )

         filepath = file_data.source_identifiers.relative_path