unstructured-ingest 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of unstructured-ingest has been flagged as potentially problematic.

Files changed (40)
  1. test/integration/connectors/weaviate/test_local.py +27 -6
  2. test/integration/embedders/test_azure_openai.py +1 -3
  3. test/integration/embedders/test_bedrock.py +2 -2
  4. test/integration/embedders/test_huggingface.py +1 -3
  5. test/integration/embedders/test_mixedbread.py +2 -2
  6. test/integration/embedders/test_octoai.py +2 -4
  7. test/integration/embedders/test_openai.py +2 -4
  8. test/integration/embedders/test_togetherai.py +2 -2
  9. test/integration/embedders/test_vertexai.py +2 -4
  10. test/integration/embedders/test_voyageai.py +2 -4
  11. test/integration/embedders/utils.py +12 -14
  12. test/unit/embed/test_openai.py +12 -4
  13. test/unit/test_html.py +112 -0
  14. test/unit/v2/embedders/test_voyageai.py +1 -1
  15. unstructured_ingest/__version__.py +1 -1
  16. unstructured_ingest/embed/huggingface.py +6 -1
  17. unstructured_ingest/embed/interfaces.py +9 -6
  18. unstructured_ingest/embed/mixedbreadai.py +3 -10
  19. unstructured_ingest/embed/octoai.py +14 -7
  20. unstructured_ingest/embed/openai.py +18 -5
  21. unstructured_ingest/embed/togetherai.py +19 -8
  22. unstructured_ingest/embed/vertexai.py +13 -6
  23. unstructured_ingest/embed/voyageai.py +19 -6
  24. unstructured_ingest/utils/html.py +143 -93
  25. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  26. unstructured_ingest/v2/interfaces/indexer.py +2 -3
  27. unstructured_ingest/v2/interfaces/process.py +3 -0
  28. unstructured_ingest/v2/interfaces/uploader.py +14 -1
  29. unstructured_ingest/v2/pipeline/pipeline.py +20 -6
  30. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  31. unstructured_ingest/v2/processes/connectors/confluence.py +15 -22
  32. unstructured_ingest/v2/processes/connectors/onedrive.py +5 -29
  33. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +41 -3
  34. unstructured_ingest/v2/processes/embedder.py +3 -0
  35. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/METADATA +9 -9
  36. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/RECORD +40 -38
  37. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/LICENSE.md +0 -0
  38. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/WHEEL +0 -0
  39. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/entry_points.txt +0 -0
  40. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.4.dist-info}/top_level.txt +0 -0

unstructured_ingest/embed/togetherai.py

@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     RateLimitError as CustomRateLimitError,
@@ -71,13 +72,18 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_client()
+        embeddings = []
         try:
-            outputs = client.embeddings.create(
-                model=self.config.embedder_model_name, input=elements
-            )
+            for batch in batch_generator(
+                elements, batch_size=self.config.batch_size or len(elements)
+            ):
+                outputs = client.embeddings.create(
+                    model=self.config.embedder_model_name, input=batch
+                )
+                embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [outputs.data[i].embedding for i in range(len(elements))]
+        return embeddings
 
 
 @dataclass
@@ -97,10 +103,15 @@ class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
 
     async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_async_client()
+        embeddings = []
         try:
-            outputs = await client.embeddings.create(
-                model=self.config.embedder_model_name, input=elements
-            )
+            for batch in batch_generator(
+                elements, batch_size=self.config.batch_size or len(elements)
+            ):
+                outputs = await client.embeddings.create(
+                    model=self.config.embedder_model_name, input=batch
+                )
+                embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [outputs.data[i].embedding for i in range(len(elements))]
+        return embeddings
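
Both encoders now chunk their requests through `batch_generator` from `unstructured_ingest.utils.data_prep`. Its implementation is not part of this diff; the following is a minimal sketch of a fixed-size slicing helper, offered only as an assumption about its behavior:

    from typing import Iterator, TypeVar

    T = TypeVar("T")

    def batch_generator(items: list[T], batch_size: int) -> Iterator[list[T]]:
        # Yield consecutive slices of at most batch_size items;
        # the final batch may be shorter.
        for start in range(0, len(items), batch_size):
            yield items[start : start + batch_size]

Note the call sites pass `batch_size=self.config.batch_size or len(elements)`, so an unset batch size degenerates to a single batch containing every element, preserving the old single-request behavior.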

unstructured_ingest/embed/vertexai.py

@@ -13,6 +13,7 @@ from unstructured_ingest.embed.interfaces import (
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import UserAuthError
 
@@ -86,12 +87,15 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
         from vertexai.language_models import TextEmbeddingInput
 
         inputs = [TextEmbeddingInput(text=element) for element in elements]
+        client = self.config.get_client()
+        embeddings = []
         try:
-            client = self.config.get_client()
-            embeddings = client.get_embeddings(inputs)
+            for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
+                response = client.get_embeddings(batch)
+                embeddings.extend([e.values for e in response])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [e.values for e in embeddings]
+        return embeddings
 
 
 @dataclass
@@ -118,9 +122,12 @@ class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         from vertexai.language_models import TextEmbeddingInput
 
         inputs = [TextEmbeddingInput(text=element) for element in elements]
+        client = self.config.get_client()
+        embeddings = []
         try:
-            client = self.config.get_client()
-            embeddings = await client.get_embeddings_async(inputs)
+            for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
+                response = await client.get_embeddings_async(batch)
+                embeddings.extend([e.values for e in response])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [e.values for e in embeddings]
+        return embeddings

unstructured_ingest/embed/voyageai.py

@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     ProviderError,
@@ -25,9 +26,13 @@ if TYPE_CHECKING:
 
 
 class VoyageAIEmbeddingConfig(EmbeddingConfig):
+    batch_size: int = Field(
+        default=32,
+        le=128,
+        description="Batch size for embedding requests. VoyageAI has a limit of 128.",
+    )
     api_key: SecretStr
     embedder_model_name: str = Field(default="voyage-3", alias="model_name")
-    batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
     max_retries: int = 0
     timeout_in_seconds: Optional[int] = None
@@ -91,12 +96,15 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return self.config.wrap_error(e=e)
 
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        client: VoyageAIClient = self.config.get_client()
+        client = self.config.get_client()
+        embeddings = []
         try:
-            response = client.embed(texts=elements, model=self.config.embedder_model_name)
+            for batch in batch_generator(elements, batch_size=self.config.batch_size):
+                response = client.embed(texts=batch, model=self.config.embedder_model_name)
+                embeddings.extend(response.embeddings)
         except Exception as e:
             raise self.wrap_error(e=e)
-        return response.embeddings
+        return embeddings
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = self._embed_documents([e.get("text", "") for e in elements])
@@ -115,11 +123,16 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
 
     async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_async_client()
+        embeddings = []
         try:
-            response = await client.embed(texts=elements, model=self.config.embedder_model_name)
+            for batch in batch_generator(
+                elements, batch_size=self.config.batch_size or len(elements)
+            ):
+                response = await client.embed(texts=batch, model=self.config.embedder_model_name)
+                embeddings.extend(response.embeddings)
         except Exception as e:
             raise self.wrap_error(e=e)
-        return response.embeddings
+        return embeddings
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = await self._embed_documents([e.get("text", "") for e in elements])
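
Unlike the other encoders, VoyageAI's batch size is now a constrained required field rather than an optional passthrough: the `Field(le=128)` bound rejects oversized values when the config is constructed. A short sketch, assuming `EmbeddingConfig` behaves as an ordinary pydantic model and using a placeholder API key:

    from pydantic import ValidationError

    from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig

    config = VoyageAIEmbeddingConfig(api_key="vk-placeholder")  # hypothetical key
    assert config.batch_size == 32  # the new default

    try:
        VoyageAIEmbeddingConfig(api_key="vk-placeholder", batch_size=256)
    except ValidationError as e:
        print(e)  # batch_size: Input should be less than or equal to 128

Since the field always has a value, the sync path passes `self.config.batch_size` directly, while the async path retains the `or len(elements)` fallback.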

unstructured_ingest/utils/html.py

@@ -1,109 +1,159 @@
 import base64
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 from urllib.parse import urlparse
 from uuid import NAMESPACE_DNS, uuid5
 
-import requests
-from bs4 import BeautifulSoup
-from requests import Session
+from pydantic import BaseModel, Field
 
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
 
+if TYPE_CHECKING:
+    from bs4.element import Tag
+    from requests import Session
 
-def convert_image_tags(url: str, original_html: str, session: Optional[Session] = None) -> str:
-    session = session or requests.Session()
-    parsed_url = urlparse(url)
-    base_url = parsed_url.scheme + "://" + parsed_url.netloc
-    soup = BeautifulSoup(original_html, "html.parser")
-    images = soup.find_all("img")
-    for image in images:
-        current_source = image["src"]
-        if current_source.startswith("//"):
-            source_url = f"{parsed_url.scheme}:{current_source}"
-        elif current_source.startswith("http"):
-            source_url = current_source
+
+class HtmlMixin(BaseModel):
+    extract_images: bool = Field(
+        default=False,
+        description="if true, will download images and replace "
+        "the html content with base64 encoded images",
+    )
+    extract_files: bool = Field(
+        default=False, description="if true, will download any embedded files"
+    )
+    force_download: bool = Field(
+        default=False,
+        description="if true, will redownload extracted files even if they already exist locally",
+    )
+    allow_list: Optional[list[str]] = Field(
+        default=None,
+        description="list of allowed urls to download, if not set, "
+        "will default to the base url the original HTML came from",
+    )
+
+    @requires_dependencies(["requests"])
+    def get_default_session(self) -> "Session":
+        import requests
+
+        return requests.Session()
+
+    def get_absolute_url(self, tag_link: str, url: str) -> str:
+        parsed_url = urlparse(url)
+        base_url = parsed_url.scheme + "://" + parsed_url.netloc
+        if tag_link.startswith("//"):
+            return f"{parsed_url.scheme}:{tag_link}"
+        elif tag_link.startswith("http"):
+            return tag_link
         else:
-            source_url = base_url + current_source
-        try:
-            response = session.get(source_url)
-            response.raise_for_status()
-            image_content = response.content
-            logger.debug(
-                "img tag having src updated from {} to base64 content".format(image["src"])
-            )
-            image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
-        except Exception as e:
-            logger.warning(
-                f"failed to download image content from {source_url}: {e}", exc_info=True
-            )
-    return str(soup)
-
-
-def download_link(
-    download_dir: Path, link: str, session: Optional[Session] = None, force_download: bool = False
-) -> Path:
-    session = session or requests.Session()
-    filename = Path(urlparse(url=link).path).name
-    download_path = download_dir / filename
-    logger.debug(f"downloading file from {link} to {download_path}")
-    if download_path.exists() and download_path.is_file() and not force_download:
-        return download_path
-    with download_path.open("wb") as downloaded_file:
-        response = session.get(link)
+            tag_link = tag_link.lstrip("/")
+            return f"{base_url}/{tag_link}"
+
+    def download_content(self, url: str, session: "Session") -> bytes:
+        response = session.get(url)
         response.raise_for_status()
-        downloaded_file.write(response.content)
-    return download_path
-
-
-def download_embedded_files(
-    download_dir: Path,
-    original_filedata: FileData,
-    original_html: str,
-    session: Optional[Session] = None,
-    force_download: bool = False,
-) -> list[DownloadResponse]:
-    session = session or requests.Session()
-    url = original_filedata.metadata.url
-    parsed_url = urlparse(url)
-    base_url = parsed_url.scheme + "://" + parsed_url.netloc
-    soup = BeautifulSoup(original_html, "html.parser")
-    tags = soup.find_all("a", href=True)
-    hrefs = [
-        tag["href"]
-        for tag in tags
-        if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
-    ]
-    results = []
-    for current_source in hrefs:
-        download_dir.mkdir(parents=True, exist_ok=True)
-        if current_source.startswith("//"):
-            source_url = f"{parsed_url.scheme}:{current_source}"
-        elif current_source.startswith("http"):
-            source_url = current_source
-        else:
-            source_url = base_url + current_source
-        try:
-            downloaded_path = download_link(
-                download_dir=download_dir,
-                link=source_url,
-                session=session,
-                force_download=force_download,
-            )
-        except Exception as e:
-            logger.warning(f"failed to download file content from {source_url}: {e}")
-            continue
-        result_file_data = original_filedata.model_copy(deep=True)
-        result_file_data.metadata.url = source_url
-        result_file_data.metadata.record_locator["parent_url"] = url
-        result_file_data.identifier = str(
-            uuid5(NAMESPACE_DNS, source_url + original_filedata.identifier)
+        return response.content
+
+    def can_download(self, url_to_download: str, original_url: str) -> bool:
+        parsed_original_url = urlparse(original_url)
+        base_url = parsed_original_url.scheme + "://" + parsed_original_url.netloc
+        allow_list = self.allow_list or [base_url]
+        for allowed_url in allow_list:
+            if url_to_download.startswith(allowed_url):
+                return True
+        logger.info(f"Skipping url because it does not match the allow list: {url_to_download}")
+        return False
+
+    def extract_image_src(self, image: "Tag", url: str, session: "Session") -> "Tag":
+        current_src = image["src"]
+        if current_src.startswith("data:image/png;base64"):
+            # already base64 encoded
+            return image
+        absolute_url = self.get_absolute_url(tag_link=image["src"], url=url)
+        if not self.can_download(url_to_download=absolute_url, original_url=url):
+            return image
+        image_content = self.download_content(url=absolute_url, session=session)
+        logger.debug("img tag having src updated from {} to base64 content".format(image["src"]))
+        image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+        return image
+
+    @requires_dependencies(["bs4"])
+    def extract_html_images(self, url: str, html: str, session: Optional["Session"] = None) -> str:
+        from bs4 import BeautifulSoup
+
+        session = session or self.get_default_session()
+        soup = BeautifulSoup(html, "html.parser")
+        images = soup.find_all("img")
+        for image in images:
+            self.extract_image_src(image=image, url=url, session=session)
+        return str(soup)
+
+    @requires_dependencies(["bs4"])
+    def get_hrefs(self, url: str, html: str) -> list:
+        from bs4 import BeautifulSoup
+
+        soup = BeautifulSoup(html, "html.parser")
+        tags = soup.find_all("a", href=True)
+        hrefs = [
+            tag["href"]
+            for tag in tags
+            if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+        ]
+        absolute_urls = [self.get_absolute_url(tag_link=href, url=url) for href in hrefs]
+        allowed_urls = [
+            url_to_download
+            for url_to_download in absolute_urls
+            if self.can_download(url_to_download=url_to_download, original_url=url)
+        ]
+        return allowed_urls
+
+    def write_content(self, content: bytes, path: Path) -> None:
+        if path.exists() and path.is_file() and not self.force_download:
+            return
+        if not path.parent.exists():
+            path.parent.mkdir(parents=True)
+        with path.open("wb") as f:
+            f.write(content)
+
+    def get_download_response(
+        self, url: str, download_dir: Path, file_data: FileData, session: "Session"
+    ) -> DownloadResponse:
+        filename = Path(urlparse(url=url).path).name
+        download_path = download_dir / filename
+        self.write_content(
+            content=self.download_content(url=url, session=session), path=download_path
         )
-        filename = Path(urlparse(url=source_url).path).name
+        result_file_data = file_data.model_copy(deep=True)
+        result_file_data.metadata.url = url
+        if result_file_data.metadata.record_locator is None:
+            result_file_data.metadata.record_locator = {}
+        result_file_data.metadata.record_locator["parent_url"] = url
+        result_file_data.identifier = str(uuid5(NAMESPACE_DNS, url + file_data.identifier))
+        filename = Path(urlparse(url=url).path).name
         result_file_data.source_identifiers = SourceIdentifiers(
             filename=filename, fullpath=filename
         )
-        result_file_data.local_download_path = downloaded_path.as_posix()
-        results.append(DownloadResponse(file_data=result_file_data, path=downloaded_path))
-    return results
+        result_file_data.local_download_path = download_path.as_posix()
+        return DownloadResponse(file_data=result_file_data, path=download_path)
+
+    def extract_embedded_files(
+        self,
+        url: str,
+        html: str,
+        download_dir: Path,
+        original_filedata: FileData,
+        session: Optional["Session"] = None,
+    ) -> list[DownloadResponse]:
+        session = session or self.get_default_session()
+        urls_to_download = self.get_hrefs(url=url, html=html)
+        return [
+            self.get_download_response(
+                url=url_to_download,
+                download_dir=download_dir,
+                file_data=original_filedata,
+                session=session,
+            )
+            for url_to_download in urls_to_download
+        ]
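
The old module-level helpers (`convert_image_tags`, `download_link`, `download_embedded_files`) are folded into a pydantic `HtmlMixin`, so connector download configs inherit the fields and the behavior together, and downloads are now gated by an allow list. One behavioral difference worth noting: the old image loop caught and logged per-image download failures, while `extract_image_src` now lets them propagate. A hedged standalone sketch of the image-inlining path (connectors normally reach it through their config, as in the confluence changes below):

    from unstructured_ingest.utils.html import HtmlMixin

    mixin = HtmlMixin(extract_images=True, allow_list=["https://example.com"])

    html = '<html><body><img src="/logo.png"/></body></html>'
    # Resolves each img src to an absolute URL, checks it against the
    # allow list via can_download(), downloads it, and rewrites the src
    # to a base64 data URI. Performs a real HTTP request when run.
    inlined = mixin.extract_html_images(url="https://example.com/page", html=html)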

unstructured_ingest/v2/interfaces/__init__.py

@@ -5,7 +5,7 @@ from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
 from .upload_stager import UploadStager, UploadStagerConfig
-from .uploader import UploadContent, Uploader, UploaderConfig
+from .uploader import UploadContent, Uploader, UploaderConfig, VectorDBUploader
 
 __all__ = [
     "DownloadResponse",
@@ -29,4 +29,5 @@ __all__ = [
     "FileDataSourceMetadata",
     "BatchFileData",
     "BatchItem",
+    "VectorDBUploader",
 ]

unstructured_ingest/v2/interfaces/indexer.py

@@ -1,4 +1,4 @@
-from abc import ABC, abstractmethod
+from abc import ABC
 from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
 
 from pydantic import BaseModel
@@ -22,9 +22,8 @@ class Indexer(BaseProcess, BaseConnector, ABC):
     def is_async(self) -> bool:
         return False
 
-    @abstractmethod
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        pass
+        raise NotImplementedError()
 
     async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
         raise NotImplementedError()

unstructured_ingest/v2/interfaces/process.py

@@ -8,6 +8,9 @@ class BaseProcess(ABC):
     def is_async(self) -> bool:
         return False
 
+    def init(self, *kwargs: Any) -> None:
+        pass
+
     def precheck(self) -> None:
         pass
 
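
The new `init()` hook gives every process a place for one-time, potentially slow setup before prechecks run (note the signature as shipped takes `*kwargs`, not `**kwargs`). It replaces the hard-coded embedder `initialize()` call removed from pipeline.py later in this diff. A hypothetical subclass, as a sketch (other abstract members elided):

    from typing import Any

    from unstructured_ingest.v2.interfaces.process import BaseProcess

    class ModelBackedProcess(BaseProcess):
        # Hypothetical process: pull model weights once, before prechecks
        # and before any worker fanout, rather than lazily per document.
        def init(self, *kwargs: Any) -> None:
            self.model = download_model_weights()  # assumed helper, not in the package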

unstructured_ingest/v2/interfaces/uploader.py

@@ -1,7 +1,7 @@
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, TypeVar
+from typing import Any, Optional, TypeVar
 
 from pydantic import BaseModel
 
@@ -38,6 +38,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
     def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
         raise NotImplementedError()
 
+    def create_destination(self, destination_name: str = "elements", **kwargs: Any) -> bool:
+        # Update the uploader config if needed with a new destination that gets created.
+        # Return a flag on if anything was created or not.
+        return False
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         data = get_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
@@ -51,3 +56,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
 
     async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         return self.run_data(data=data, file_data=file_data, **kwargs)
+
+
+@dataclass
+class VectorDBUploader(Uploader, ABC):
+    def create_destination(
+        self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
+    ) -> bool:
+        return False
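
`VectorDBUploader` extends the base `create_destination` contract with a `vector_length`, letting the pipeline provision a collection sized to the embedding dimension before upload; the weaviate connector changes in this diff (file 33 above) are a plausible first consumer. A hedged sketch of how a destination connector might override it (the helper names are hypothetical):

    from dataclasses import dataclass
    from typing import Any, Optional

    from unstructured_ingest.v2.interfaces.uploader import VectorDBUploader

    @dataclass
    class ExampleVectorUploader(VectorDBUploader):
        def create_destination(
            self,
            destination_name: str = "elements",
            vector_length: Optional[int] = None,
            **kwargs: Any,
        ) -> bool:
            # Create the target collection only if missing, sized to the
            # embedding dimension; report whether anything was created.
            if self.collection_exists(destination_name):  # hypothetical helper
                return False
            self.create_collection(destination_name, vector_length)  # hypothetical helper
            return True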

unstructured_ingest/v2/pipeline/pipeline.py

@@ -11,6 +11,7 @@ from typing import Any
 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
 from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.otel import OtelHandler
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
@@ -91,10 +92,6 @@ class Pipeline:
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
 
         self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None
-        # TODO: support initialize() call from each step process
-        # Potential long call to download embedder models, run before any fanout:
-        if embedder and embedder.config:
-            embedder.config.get_embedder().initialize()
 
         self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
         self.uploader_step = UploadStep(process=uploader, context=self.context)
@@ -135,6 +132,7 @@ class Pipeline:
             with otel_handler.get_tracer().start_as_current_span(
                 "ingest process", record_exception=True
             ):
+                self._run_inits()
                 self._run_prechecks()
                 self._run()
         finally:
@@ -156,7 +154,7 @@ class Pipeline:
         final = [f for f in flat if f]
         return final or None
 
-    def _run_prechecks(self):
+    def _get_all_steps(self) -> list[PipelineStep]:
        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
         if self.chunker_step:
             steps.append(self.chunker_step)
@@ -166,8 +164,24 @@ class Pipeline:
             steps.append(self.uncompress_step)
         if self.stager_step:
             steps.append(self.stager_step)
+        return steps
+
+    def _run_inits(self):
+        failures = {}
+
+        for step in self._get_all_steps():
+            try:
+                step.process.init()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step init failure: {k}: {v}")
+            raise PipelineError("Init failed")
+
+    def _run_prechecks(self):
         failures = {}
-        for step in steps:
+        for step in self._get_all_steps():
             try:
                 step.process.precheck()
             except Exception as e:
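
`_run_inits` deliberately mirrors `_run_prechecks`: invoke the hook on every step, collect failures keyed by process class name instead of stopping at the first one, log them all, then raise once. The pattern in isolation, as a self-contained sketch:

    class StepFailure(Exception):
        pass

    def run_hook_on_all(steps: list, hook_name: str) -> None:
        # Record every failure rather than short-circuiting, so a single
        # run surfaces all misconfigured steps at once.
        failures = {}
        for step in steps:
            try:
                getattr(step, hook_name)()
            except Exception as e:
                failures[step.__class__.__name__] = f"[{type(e).__name__}] {e}"
        if failures:
            for name, err in failures.items():
                print(f"Step {hook_name} failure: {name}: {err}")
            raise StepFailure(f"{hook_name} failed")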

unstructured_ingest/v2/processes/connectors/confluence.py

@@ -7,6 +7,7 @@ from pydantic import Field, Secret
 
 from unstructured_ingest.error import SourceConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.html import HtmlMixin
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -180,19 +181,8 @@ class ConfluenceIndexer(Indexer):
             yield file_data
 
 
-class ConfluenceDownloaderConfig(DownloaderConfig):
-    extract_images: bool = Field(
-        default=False,
-        description="if true, will download images and replace "
-        "the html content with base64 encoded images",
-    )
-    extract_files: bool = Field(
-        default=False, description="if true, will download any embedded files"
-    )
-    force_download: bool = Field(
-        default=False,
-        description="if true, will redownload extracted files even if they already exist locally",
-    )
+class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
+    pass
 
 
 @dataclass
@@ -206,24 +196,27 @@ class ConfluenceDownloader(Downloader):
     ) -> list[DownloadResponse]:
         if not self.download_config.extract_files:
             return []
-        from unstructured_ingest.utils.html import download_embedded_files
-
+        url = current_file_data.metadata.url
+        if url is None:
+            logger.warning(
+                f"""Missing URL for file: {current_file_data.source_identifiers.filename}.
+                Skipping file extraction."""
+            )
+            return []
         filepath = current_file_data.source_identifiers.relative_path
         download_path = Path(self.download_dir) / filepath
         download_dir = download_path.with_suffix("")
-        return download_embedded_files(
+        return self.download_config.extract_embedded_files(
+            url=url,
             download_dir=download_dir,
             original_filedata=current_file_data,
-            original_html=html,
+            html=html,
             session=session,
-            force_download=self.download_config.force_download,
         )
 
     def run(self, file_data: FileData, **kwargs) -> download_responses:
         from bs4 import BeautifulSoup
 
-        from unstructured_ingest.utils.html import convert_image_tags
-
         doc_id = file_data.identifier
         try:
             with self.connection_config.get_client() as client:
@@ -246,8 +239,8 @@ class ConfluenceDownloader(Downloader):
             content = f"<body class='Document' >{title_html}{content}</body>"
             if self.download_config.extract_images:
                 with self.connection_config.get_client() as client:
-                    content = convert_image_tags(
-                        url=file_data.metadata.url, original_html=content, session=client._session
+                    content = self.download_config.extract_html_images(
+                        url=file_data.metadata.url, html=content, session=client._session
                     )
 
             filepath = file_data.source_identifiers.relative_path
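
With the config reduced to `DownloaderConfig` plus `HtmlMixin`, the three old fields keep their names and defaults, and `allow_list` arrives for free. An illustrative construction (values are hypothetical):

    from unstructured_ingest.v2.processes.connectors.confluence import (
        ConfluenceDownloaderConfig,
    )

    config = ConfluenceDownloaderConfig(
        extract_images=True,
        extract_files=True,
        allow_list=["https://example.atlassian.net"],  # hypothetical Confluence base URL
    )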