unstructured-ingest 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (45)
  1. test/integration/connectors/sql/test_databricks_delta_tables.py +10 -10
  2. test/integration/connectors/weaviate/test_local.py +27 -6
  3. test/integration/embedders/test_azure_openai.py +1 -3
  4. test/integration/embedders/test_bedrock.py +2 -2
  5. test/integration/embedders/test_huggingface.py +1 -3
  6. test/integration/embedders/test_mixedbread.py +2 -2
  7. test/integration/embedders/test_octoai.py +2 -4
  8. test/integration/embedders/test_openai.py +2 -4
  9. test/integration/embedders/test_togetherai.py +2 -2
  10. test/integration/embedders/test_vertexai.py +2 -4
  11. test/integration/embedders/test_voyageai.py +2 -4
  12. test/integration/embedders/utils.py +12 -14
  13. test/unit/embed/test_openai.py +12 -4
  14. test/unit/test_html.py +112 -0
  15. test/unit/v2/connectors/databricks/__init__.py +0 -0
  16. test/unit/v2/connectors/databricks/test_volumes_table.py +44 -0
  17. test/unit/v2/embedders/test_voyageai.py +1 -1
  18. unstructured_ingest/__version__.py +1 -1
  19. unstructured_ingest/embed/huggingface.py +6 -1
  20. unstructured_ingest/embed/interfaces.py +9 -6
  21. unstructured_ingest/embed/mixedbreadai.py +3 -10
  22. unstructured_ingest/embed/octoai.py +14 -7
  23. unstructured_ingest/embed/openai.py +18 -5
  24. unstructured_ingest/embed/togetherai.py +19 -8
  25. unstructured_ingest/embed/vertexai.py +13 -6
  26. unstructured_ingest/embed/voyageai.py +19 -6
  27. unstructured_ingest/utils/data_prep.py +1 -1
  28. unstructured_ingest/utils/html.py +143 -93
  29. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  30. unstructured_ingest/v2/interfaces/process.py +3 -0
  31. unstructured_ingest/v2/interfaces/uploader.py +14 -1
  32. unstructured_ingest/v2/pipeline/pipeline.py +20 -6
  33. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  34. unstructured_ingest/v2/processes/connectors/confluence.py +15 -22
  35. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +14 -11
  36. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +15 -15
  37. unstructured_ingest/v2/processes/connectors/sql/sql.py +4 -1
  38. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +41 -3
  39. unstructured_ingest/v2/processes/embedder.py +3 -0
  40. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/METADATA +22 -22
  41. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/RECORD +45 -41
  42. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/LICENSE.md +0 -0
  43. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/WHEEL +0 -0
  44. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/entry_points.txt +0 -0
  45. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/top_level.txt +0 -0
--- a/unstructured_ingest/embed/octoai.py
+++ b/unstructured_ingest/embed/octoai.py
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     ProviderError,
@@ -89,12 +90,16 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         texts = [e.get("text", "") for e in elements]
+        embeddings = []
+        client = self.config.get_client()
         try:
-            client = self.config.get_client()
-            response = client.embeddings.create(input=texts, model=self.config.embedder_model_name)
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                response = client.embeddings.create(
+                    input=batch, model=self.config.embedder_model_name
+                )
+                embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-        embeddings = [data.embedding for data in response.data]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
 
@@ -119,12 +124,14 @@ class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
         texts = [e.get("text", "") for e in elements]
         client = self.config.get_async_client()
+        embeddings = []
         try:
-            response = await client.embeddings.create(
-                input=texts, model=self.config.embedder_model_name
-            )
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                response = await client.embeddings.create(
+                    input=batch, model=self.config.embedder_model_name
+                )
+                embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-        embeddings = [data.embedding for data in response.data]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
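All of the embedder changes in this release follow this same pattern: requests are chunked through batch_generator from unstructured_ingest.utils.data_prep instead of being sent in a single call. The helper itself is not shown in this diff; a minimal sketch of what a batching generator like this presumably does:

    from typing import Generator, TypeVar

    T = TypeVar("T")

    def batch_generator(items: list[T], batch_size: int) -> Generator[list[T], None, None]:
        # Yield successive slices of at most batch_size items;
        # the final batch may be shorter than batch_size.
        for i in range(0, len(items), batch_size):
            yield items[i : i + batch_size]

Note that most call sites guard with self.config.batch_size or len(texts), so an unset batch_size degrades to one batch covering the whole input.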
--- a/unstructured_ingest/embed/openai.py
+++ b/unstructured_ingest/embed/openai.py
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     ProviderError,
@@ -80,7 +81,17 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return response.data[0].embedding
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-        embeddings = self._embed_documents([e.get("text", "") for e in elements])
+        client = self.config.get_client()
+        texts = [e.get("text", "") for e in elements]
+        embeddings = []
+        try:
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                response = client.embeddings.create(
+                    input=batch, model=self.config.embedder_model_name
+                )
+                embeddings.extend([data.embedding for data in response.data])
+        except Exception as e:
+            raise self.wrap_error(e=e)
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
 
@@ -105,12 +116,14 @@ class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
         client = self.config.get_async_client()
         texts = [e.get("text", "") for e in elements]
+        embeddings = []
         try:
-            response = await client.embeddings.create(
-                input=texts, model=self.config.embedder_model_name
-            )
+            for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
+                response = await client.embeddings.create(
+                    input=batch, model=self.config.embedder_model_name
+                )
+                embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-        embeddings = [data.embedding for data in response.data]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
--- a/unstructured_ingest/embed/togetherai.py
+++ b/unstructured_ingest/embed/togetherai.py
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     RateLimitError as CustomRateLimitError,
@@ -71,13 +72,18 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_client()
+        embeddings = []
         try:
-            outputs = client.embeddings.create(
-                model=self.config.embedder_model_name, input=elements
-            )
+            for batch in batch_generator(
+                elements, batch_size=self.config.batch_size or len(elements)
+            ):
+                outputs = client.embeddings.create(
+                    model=self.config.embedder_model_name, input=batch
+                )
+                embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [outputs.data[i].embedding for i in range(len(elements))]
+        return embeddings
 
 
 @dataclass
@@ -97,10 +103,15 @@ class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
 
     async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_async_client()
+        embeddings = []
         try:
-            outputs = await client.embeddings.create(
-                model=self.config.embedder_model_name, input=elements
-            )
+            for batch in batch_generator(
+                elements, batch_size=self.config.batch_size or len(elements)
+            ):
+                outputs = await client.embeddings.create(
+                    model=self.config.embedder_model_name, input=batch
+                )
+                embeddings.extend([outputs.data[i].embedding for i in range(len(batch))])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [outputs.data[i].embedding for i in range(len(elements))]
+        return embeddings
--- a/unstructured_ingest/embed/vertexai.py
+++ b/unstructured_ingest/embed/vertexai.py
@@ -13,6 +13,7 @@ from unstructured_ingest.embed.interfaces import (
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import UserAuthError
 
@@ -86,12 +87,15 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
         from vertexai.language_models import TextEmbeddingInput
 
         inputs = [TextEmbeddingInput(text=element) for element in elements]
+        client = self.config.get_client()
+        embeddings = []
         try:
-            client = self.config.get_client()
-            embeddings = client.get_embeddings(inputs)
+            for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
+                response = client.get_embeddings(batch)
+                embeddings.extend([e.values for e in response])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [e.values for e in embeddings]
+        return embeddings
 
 
 @dataclass
@@ -118,9 +122,12 @@ class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         from vertexai.language_models import TextEmbeddingInput
 
         inputs = [TextEmbeddingInput(text=element) for element in elements]
+        client = self.config.get_client()
+        embeddings = []
         try:
-            client = self.config.get_client()
-            embeddings = await client.get_embeddings_async(inputs)
+            for batch in batch_generator(inputs, batch_size=self.config.batch_size or len(inputs)):
+                response = await client.get_embeddings_async(batch)
+                embeddings.extend([e.values for e in response])
         except Exception as e:
             raise self.wrap_error(e=e)
-        return [e.values for e in embeddings]
+        return embeddings
--- a/unstructured_ingest/embed/voyageai.py
+++ b/unstructured_ingest/embed/voyageai.py
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.interfaces import (
     EmbeddingConfig,
 )
 from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
     ProviderError,
@@ -25,9 +26,13 @@ if TYPE_CHECKING:
 
 
 class VoyageAIEmbeddingConfig(EmbeddingConfig):
+    batch_size: int = Field(
+        default=32,
+        le=128,
+        description="Batch size for embedding requests. VoyageAI has a limit of 128.",
+    )
     api_key: SecretStr
     embedder_model_name: str = Field(default="voyage-3", alias="model_name")
-    batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
     max_retries: int = 0
     timeout_in_seconds: Optional[int] = None
@@ -91,12 +96,15 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return self.config.wrap_error(e=e)
 
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
-        client: VoyageAIClient = self.config.get_client()
+        client = self.config.get_client()
+        embeddings = []
         try:
-            response = client.embed(texts=elements, model=self.config.embedder_model_name)
+            for batch in batch_generator(elements, batch_size=self.config.batch_size):
+                response = client.embed(texts=batch, model=self.config.embedder_model_name)
+                embeddings.extend(response.embeddings)
         except Exception as e:
             raise self.wrap_error(e=e)
-        return response.embeddings
+        return embeddings
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = self._embed_documents([e.get("text", "") for e in elements])
@@ -115,11 +123,16 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
 
     async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_async_client()
+        embeddings = []
         try:
-            response = await client.embed(texts=elements, model=self.config.embedder_model_name)
+            for batch in batch_generator(
+                elements, batch_size=self.config.batch_size or len(elements)
+            ):
+                response = await client.embed(texts=batch, model=self.config.embedder_model_name)
+                embeddings.extend(response.embeddings)
         except Exception as e:
             raise self.wrap_error(e=e)
-        return response.embeddings
+        return embeddings
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
         embeddings = await self._embed_documents([e.get("text", "") for e in elements])
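The VoyageAI config is the one place batch_size gains a hard default and an upper bound. A quick illustration of the new constraint, assuming standard pydantic v2 validation behavior (the api_key value is a placeholder):

    from pydantic import ValidationError

    VoyageAIEmbeddingConfig(api_key="pa-...")                  # batch_size defaults to 32
    VoyageAIEmbeddingConfig(api_key="pa-...", batch_size=128)  # at the cap, accepted

    try:
        VoyageAIEmbeddingConfig(api_key="pa-...", batch_size=256)
    except ValidationError as e:
        # le=128 rejects anything above VoyageAI's documented batch limit
        print(e)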
--- a/unstructured_ingest/utils/data_prep.py
+++ b/unstructured_ingest/utils/data_prep.py
@@ -153,7 +153,7 @@ def get_data_by_suffix(path: Path) -> list[dict]:
     raise ValueError(f"Unsupported file type: {path}")
 
 
-def write_data(path: Path, data: list[dict], indent: int = 2) -> None:
+def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
     with path.open("w") as f:
         if path.suffix == ".json":
             json.dump(data, f, indent=indent, ensure_ascii=False)
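Widening indent to Optional[int] lets callers pass None explicitly, which json.dump treats as the most compact representation:

    import json

    data = [{"text": "hello"}]
    print(json.dumps(data, indent=2))     # pretty-printed across multiple lines
    print(json.dumps(data, indent=None))  # single line: [{"text": "hello"}]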
--- a/unstructured_ingest/utils/html.py
+++ b/unstructured_ingest/utils/html.py
@@ -1,109 +1,159 @@
 import base64
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 from urllib.parse import urlparse
 from uuid import NAMESPACE_DNS, uuid5
 
-import requests
-from bs4 import BeautifulSoup
-from requests import Session
+from pydantic import BaseModel, Field
 
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
 
+if TYPE_CHECKING:
+    from bs4.element import Tag
+    from requests import Session
 
-def convert_image_tags(url: str, original_html: str, session: Optional[Session] = None) -> str:
-    session = session or requests.Session()
-    parsed_url = urlparse(url)
-    base_url = parsed_url.scheme + "://" + parsed_url.netloc
-    soup = BeautifulSoup(original_html, "html.parser")
-    images = soup.find_all("img")
-    for image in images:
-        current_source = image["src"]
-        if current_source.startswith("//"):
-            source_url = f"{parsed_url.scheme}:{current_source}"
-        elif current_source.startswith("http"):
-            source_url = current_source
+
+class HtmlMixin(BaseModel):
+    extract_images: bool = Field(
+        default=False,
+        description="if true, will download images and replace "
+        "the html content with base64 encoded images",
+    )
+    extract_files: bool = Field(
+        default=False, description="if true, will download any embedded files"
+    )
+    force_download: bool = Field(
+        default=False,
+        description="if true, will redownload extracted files even if they already exist locally",
+    )
+    allow_list: Optional[list[str]] = Field(
+        default=None,
+        description="list of allowed urls to download, if not set, "
+        "will default to the base url the original HTML came from",
+    )
+
+    @requires_dependencies(["requests"])
+    def get_default_session(self) -> "Session":
+        import requests
+
+        return requests.Session()
+
+    def get_absolute_url(self, tag_link: str, url: str) -> str:
+        parsed_url = urlparse(url)
+        base_url = parsed_url.scheme + "://" + parsed_url.netloc
+        if tag_link.startswith("//"):
+            return f"{parsed_url.scheme}:{tag_link}"
+        elif tag_link.startswith("http"):
+            return tag_link
         else:
-            source_url = base_url + current_source
-        try:
-            response = session.get(source_url)
-            response.raise_for_status()
-            image_content = response.content
-            logger.debug(
-                "img tag having src updated from {} to base64 content".format(image["src"])
-            )
-            image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
-        except Exception as e:
-            logger.warning(
-                f"failed to download image content from {source_url}: {e}", exc_info=True
-            )
-    return str(soup)
-
-
-def download_link(
-    download_dir: Path, link: str, session: Optional[Session] = None, force_download: bool = False
-) -> Path:
-    session = session or requests.Session()
-    filename = Path(urlparse(url=link).path).name
-    download_path = download_dir / filename
-    logger.debug(f"downloading file from {link} to {download_path}")
-    if download_path.exists() and download_path.is_file() and not force_download:
-        return download_path
-    with download_path.open("wb") as downloaded_file:
-        response = session.get(link)
+            tag_link = tag_link.lstrip("/")
+            return f"{base_url}/{tag_link}"
+
+    def download_content(self, url: str, session: "Session") -> bytes:
+        response = session.get(url)
         response.raise_for_status()
-        downloaded_file.write(response.content)
-    return download_path
-
-
-def download_embedded_files(
-    download_dir: Path,
-    original_filedata: FileData,
-    original_html: str,
-    session: Optional[Session] = None,
-    force_download: bool = False,
-) -> list[DownloadResponse]:
-    session = session or requests.Session()
-    url = original_filedata.metadata.url
-    parsed_url = urlparse(url)
-    base_url = parsed_url.scheme + "://" + parsed_url.netloc
-    soup = BeautifulSoup(original_html, "html.parser")
-    tags = soup.find_all("a", href=True)
-    hrefs = [
-        tag["href"]
-        for tag in tags
-        if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
-    ]
-    results = []
-    for current_source in hrefs:
-        download_dir.mkdir(parents=True, exist_ok=True)
-        if current_source.startswith("//"):
-            source_url = f"{parsed_url.scheme}:{current_source}"
-        elif current_source.startswith("http"):
-            source_url = current_source
-        else:
-            source_url = base_url + current_source
-        try:
-            downloaded_path = download_link(
-                download_dir=download_dir,
-                link=source_url,
-                session=session,
-                force_download=force_download,
-            )
-        except Exception as e:
-            logger.warning(f"failed to download file content from {source_url}: {e}")
-            continue
-        result_file_data = original_filedata.model_copy(deep=True)
-        result_file_data.metadata.url = source_url
-        result_file_data.metadata.record_locator["parent_url"] = url
-        result_file_data.identifier = str(
-            uuid5(NAMESPACE_DNS, source_url + original_filedata.identifier)
+        return response.content
+
+    def can_download(self, url_to_download: str, original_url: str) -> bool:
+        parsed_original_url = urlparse(original_url)
+        base_url = parsed_original_url.scheme + "://" + parsed_original_url.netloc
+        allow_list = self.allow_list or [base_url]
+        for allowed_url in allow_list:
+            if url_to_download.startswith(allowed_url):
+                return True
+        logger.info(f"Skipping url because it does not match the allow list: {url_to_download}")
+        return False
+
+    def extract_image_src(self, image: "Tag", url: str, session: "Session") -> "Tag":
+        current_src = image["src"]
+        if current_src.startswith("data:image/png;base64"):
+            # already base64 encoded
+            return image
+        absolute_url = self.get_absolute_url(tag_link=image["src"], url=url)
+        if not self.can_download(url_to_download=absolute_url, original_url=url):
+            return image
+        image_content = self.download_content(url=absolute_url, session=session)
+        logger.debug("img tag having src updated from {} to base64 content".format(image["src"]))
+        image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+        return image
+
+    @requires_dependencies(["bs4"])
+    def extract_html_images(self, url: str, html: str, session: Optional["Session"] = None) -> str:
+        from bs4 import BeautifulSoup
+
+        session = session or self.get_default_session()
+        soup = BeautifulSoup(html, "html.parser")
+        images = soup.find_all("img")
+        for image in images:
+            self.extract_image_src(image=image, url=url, session=session)
+        return str(soup)
+
+    @requires_dependencies(["bs4"])
+    def get_hrefs(self, url: str, html: str) -> list:
+        from bs4 import BeautifulSoup
+
+        soup = BeautifulSoup(html, "html.parser")
+        tags = soup.find_all("a", href=True)
+        hrefs = [
+            tag["href"]
+            for tag in tags
+            if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+        ]
+        absolute_urls = [self.get_absolute_url(tag_link=href, url=url) for href in hrefs]
+        allowed_urls = [
+            url_to_download
+            for url_to_download in absolute_urls
+            if self.can_download(url_to_download=url_to_download, original_url=url)
+        ]
+        return allowed_urls
+
+    def write_content(self, content: bytes, path: Path) -> None:
+        if path.exists() and path.is_file() and not self.force_download:
+            return
+        if not path.parent.exists():
+            path.parent.mkdir(parents=True)
+        with path.open("wb") as f:
+            f.write(content)
+
+    def get_download_response(
+        self, url: str, download_dir: Path, file_data: FileData, session: "Session"
+    ) -> DownloadResponse:
+        filename = Path(urlparse(url=url).path).name
+        download_path = download_dir / filename
+        self.write_content(
+            content=self.download_content(url=url, session=session), path=download_path
         )
-        filename = Path(urlparse(url=source_url).path).name
+        result_file_data = file_data.model_copy(deep=True)
+        result_file_data.metadata.url = url
+        if result_file_data.metadata.record_locator is None:
+            result_file_data.metadata.record_locator = {}
+        result_file_data.metadata.record_locator["parent_url"] = url
+        result_file_data.identifier = str(uuid5(NAMESPACE_DNS, url + file_data.identifier))
+        filename = Path(urlparse(url=url).path).name
         result_file_data.source_identifiers = SourceIdentifiers(
             filename=filename, fullpath=filename
         )
-        result_file_data.local_download_path = downloaded_path.as_posix()
-        results.append(DownloadResponse(file_data=result_file_data, path=downloaded_path))
-    return results
+        result_file_data.local_download_path = download_path.as_posix()
+        return DownloadResponse(file_data=result_file_data, path=download_path)
+
+    def extract_embedded_files(
+        self,
+        url: str,
+        html: str,
+        download_dir: Path,
+        original_filedata: FileData,
+        session: Optional["Session"] = None,
+    ) -> list[DownloadResponse]:
+        session = session or self.get_default_session()
+        urls_to_download = self.get_hrefs(url=url, html=html)
+        return [
+            self.get_download_response(
+                url=url_to_download,
+                download_dir=download_dir,
+                file_data=original_filedata,
+                session=session,
+            )
+            for url_to_download in urls_to_download
+        ]
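The standalone HTML helpers are folded into an HtmlMixin pydantic model that connector configs can inherit from, with downloads gated by an allow list. A hypothetical usage sketch (the URL and HTML here are placeholders; real connectors feed these through their download flow):

    mixin = HtmlMixin(extract_images=True, allow_list=["https://example.com"])

    html = '<html><body><img src="/logo.png"/></body></html>'
    updated = mixin.extract_html_images(url="https://example.com/page", html=html)
    # any <img> whose resolved URL passes can_download() now carries a
    # base64 data: URI in place of its remote src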
--- a/unstructured_ingest/v2/interfaces/__init__.py
+++ b/unstructured_ingest/v2/interfaces/__init__.py
@@ -5,7 +5,7 @@ from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
 from .upload_stager import UploadStager, UploadStagerConfig
-from .uploader import UploadContent, Uploader, UploaderConfig
+from .uploader import UploadContent, Uploader, UploaderConfig, VectorDBUploader
 
 __all__ = [
     "DownloadResponse",
@@ -29,4 +29,5 @@ __all__ = [
     "FileDataSourceMetadata",
     "BatchFileData",
     "BatchItem",
+    "VectorDBUploader",
 ]
--- a/unstructured_ingest/v2/interfaces/process.py
+++ b/unstructured_ingest/v2/interfaces/process.py
@@ -8,6 +8,9 @@ class BaseProcess(ABC):
     def is_async(self) -> bool:
         return False
 
+    def init(self, *kwargs: Any) -> None:
+        pass
+
     def precheck(self) -> None:
         pass
 
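The new init() hook gives every process a place for one-time setup; per the pipeline change below, the motivating case was downloading embedder models before any fanout. A hypothetical override, assuming a process with a slow model load (load_model is a placeholder, not a library call):

    from typing import Any

    class MyEmbedderProcess(BaseProcess):
        def init(self, *kwargs: Any) -> None:
            # runs once, before prechecks and before any worker fanout
            self.model = load_model("my-embedding-model")  # placeholder helper

Note the hook is declared with *kwargs rather than **kwargs in this release.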
--- a/unstructured_ingest/v2/interfaces/uploader.py
+++ b/unstructured_ingest/v2/interfaces/uploader.py
@@ -1,7 +1,7 @@
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, TypeVar
+from typing import Any, Optional, TypeVar
 
 from pydantic import BaseModel
 
@@ -38,6 +38,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
     def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
         raise NotImplementedError()
 
+    def create_destination(self, destination_name: str = "elements", **kwargs: Any) -> bool:
+        # Update the uploader config if needed with a new destination that gets created.
+        # Return a flag on if anything was created or not.
+        return False
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         data = get_data(path=path)
         self.run_data(data=data, file_data=file_data, **kwargs)
@@ -51,3 +56,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
 
     async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         return self.run_data(data=data, file_data=file_data, **kwargs)
+
+
+@dataclass
+class VectorDBUploader(Uploader, ABC):
+    def create_destination(
+        self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
+    ) -> bool:
+        return False
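VectorDBUploader extends create_destination with a vector_length parameter so vector stores can size a collection to the embedding dimension. A hypothetical concrete subclass, where collection_exists and create_collection stand in for a real client's API:

    from dataclasses import dataclass
    from typing import Any, Optional

    @dataclass
    class MyVectorUploader(VectorDBUploader):
        def create_destination(
            self,
            destination_name: str = "elements",
            vector_length: Optional[int] = None,
            **kwargs: Any,
        ) -> bool:
            client = self.connection_config.get_client()       # assumed connector helper
            if client.collection_exists(destination_name):     # placeholder API
                return False
            client.create_collection(destination_name, vector_length)  # placeholder API
            return True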
--- a/unstructured_ingest/v2/pipeline/pipeline.py
+++ b/unstructured_ingest/v2/pipeline/pipeline.py
@@ -11,6 +11,7 @@ from typing import Any
 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
 from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.otel import OtelHandler
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
@@ -91,10 +92,6 @@ class Pipeline:
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
 
         self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None
-        # TODO: support initialize() call from each step process
-        # Potential long call to download embedder models, run before any fanout:
-        if embedder and embedder.config:
-            embedder.config.get_embedder().initialize()
 
         self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
         self.uploader_step = UploadStep(process=uploader, context=self.context)
@@ -135,6 +132,7 @@ class Pipeline:
             with otel_handler.get_tracer().start_as_current_span(
                 "ingest process", record_exception=True
             ):
+                self._run_inits()
                 self._run_prechecks()
                 self._run()
         finally:
@@ -156,7 +154,7 @@ class Pipeline:
         final = [f for f in flat if f]
         return final or None
 
-    def _run_prechecks(self):
+    def _get_all_steps(self) -> list[PipelineStep]:
         steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
         if self.chunker_step:
             steps.append(self.chunker_step)
@@ -166,8 +164,24 @@ class Pipeline:
             steps.append(self.uncompress_step)
         if self.stager_step:
             steps.append(self.stager_step)
+        return steps
+
+    def _run_inits(self):
+        failures = {}
+
+        for step in self._get_all_steps():
+            try:
+                step.process.init()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step init failure: {k}: {v}")
+            raise PipelineError("Init failed")
+
+    def _run_prechecks(self):
         failures = {}
-        for step in steps:
+        for step in self._get_all_steps():
             try:
                 step.process.precheck()
             except Exception as e:
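Net effect: the embedder-only initialize() special case becomes a uniform per-step hook. The startup sequence inside Pipeline.run() is now roughly the following, condensed for illustration rather than quoted verbatim:

    self._run_inits()       # step.process.init() on every step; failures are
                            # collected, logged, then raised as PipelineError("Init failed")
    self._run_prechecks()   # same collect-log-raise pattern for precheck()
    self._run()             # the actual ingest run, reached only if both passed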