unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (86)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/sql/__init__.py +0 -0
  10. test/integration/connectors/sql/test_postgres.py +178 -0
  11. test/integration/connectors/sql/test_sqlite.py +151 -0
  12. test/integration/connectors/test_s3.py +152 -0
  13. test/integration/connectors/utils/__init__.py +0 -0
  14. test/integration/connectors/utils/constants.py +7 -0
  15. test/integration/connectors/utils/docker_compose.py +44 -0
  16. test/integration/connectors/utils/validation.py +203 -0
  17. test/integration/embedders/__init__.py +0 -0
  18. test/integration/embedders/conftest.py +13 -0
  19. test/integration/embedders/test_bedrock.py +49 -0
  20. test/integration/embedders/test_huggingface.py +26 -0
  21. test/integration/embedders/test_mixedbread.py +47 -0
  22. test/integration/embedders/test_octoai.py +41 -0
  23. test/integration/embedders/test_openai.py +41 -0
  24. test/integration/embedders/test_vertexai.py +41 -0
  25. test/integration/embedders/test_voyageai.py +41 -0
  26. test/integration/embedders/togetherai.py +43 -0
  27. test/integration/embedders/utils.py +44 -0
  28. test/integration/partitioners/__init__.py +0 -0
  29. test/integration/partitioners/test_partitioner.py +75 -0
  30. test/integration/utils.py +15 -0
  31. test/unit/__init__.py +0 -0
  32. test/unit/embed/__init__.py +0 -0
  33. test/unit/embed/test_mixedbreadai.py +41 -0
  34. test/unit/embed/test_octoai.py +20 -0
  35. test/unit/embed/test_openai.py +20 -0
  36. test/unit/embed/test_vertexai.py +25 -0
  37. test/unit/embed/test_voyageai.py +24 -0
  38. test/unit/test_chunking_utils.py +36 -0
  39. test/unit/test_error.py +27 -0
  40. test/unit/test_interfaces.py +280 -0
  41. test/unit/test_interfaces_v2.py +26 -0
  42. test/unit/test_logger.py +78 -0
  43. test/unit/test_utils.py +164 -0
  44. test/unit/test_utils_v2.py +82 -0
  45. unstructured_ingest/__version__.py +1 -1
  46. unstructured_ingest/cli/interfaces.py +2 -2
  47. unstructured_ingest/connector/notion/types/block.py +1 -0
  48. unstructured_ingest/connector/notion/types/database.py +1 -0
  49. unstructured_ingest/connector/notion/types/page.py +1 -0
  50. unstructured_ingest/embed/bedrock.py +0 -20
  51. unstructured_ingest/embed/huggingface.py +0 -21
  52. unstructured_ingest/embed/interfaces.py +29 -3
  53. unstructured_ingest/embed/mixedbreadai.py +0 -36
  54. unstructured_ingest/embed/octoai.py +2 -24
  55. unstructured_ingest/embed/openai.py +0 -20
  56. unstructured_ingest/embed/togetherai.py +40 -0
  57. unstructured_ingest/embed/vertexai.py +0 -20
  58. unstructured_ingest/embed/voyageai.py +1 -24
  59. unstructured_ingest/interfaces.py +1 -1
  60. unstructured_ingest/v2/cli/utils/click.py +21 -2
  61. unstructured_ingest/v2/interfaces/connector.py +22 -2
  62. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  63. unstructured_ingest/v2/processes/chunker.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  65. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  70. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  71. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
  72. unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  74. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  75. unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
  76. unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
  77. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
  78. unstructured_ingest/v2/processes/embedder.py +13 -0
  79. unstructured_ingest/v2/processes/partitioner.py +2 -1
  80. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
  83. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  84. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
unstructured_ingest/embed/bedrock.py

@@ -3,7 +3,6 @@ import os
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-import numpy as np
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig

@@ -45,17 +44,6 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
 class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
     config: BedrockEmbeddingConfig
 
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="Q")
-
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-
     def embed_query(self, query: str) -> list[float]:
         """Call out to Bedrock embedding endpoint."""
         # replace newlines, which can negatively affect performance.

@@ -97,11 +85,3 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
         embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-
-    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
unstructured_ingest/embed/huggingface.py

@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-import numpy as np
 from pydantic import Field
 
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig

@@ -39,17 +38,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
     config: HuggingFaceEmbeddingConfig
 
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="Q")
-
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-
     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(texts=[query])[0]
 

@@ -62,12 +50,3 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
         embeddings = self._embed_documents([e.get("text", "") for e in elements])
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-
-    def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
unstructured_ingest/embed/interfaces.py

@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 
+import numpy as np
 from pydantic import BaseModel
 
 

@@ -17,14 +18,18 @@ class BaseEmbeddingEncoder(ABC):
     is properly configured: e.g., embed a single a element"""
 
     @property
-    @abstractmethod
     def num_of_dimensions(self) -> tuple[int, ...]:
-        """Number of dimensions for the embedding vector."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def get_exemplary_embedding(self) -> list[float]:
+        return self.embed_query(query="Q")
 
     @property
-    @abstractmethod
     def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
 
     @abstractmethod
     def embed_documents(self, elements: list[dict]) -> list[dict]:

@@ -41,3 +46,24 @@ class BaseEmbeddingEncoder(ABC):
             results.append(response)
 
         return results
+
+    @staticmethod
+    def _add_embeddings_to_elements(
+        elements: list[dict], embeddings: list[list[float]]
+    ) -> list[dict]:
+        """
+        Add embeddings to elements.
+
+        Args:
+            elements (list[Element]): List of elements.
+            embeddings (list[list[float]]): List of embeddings.
+
+        Returns:
+            list[Element]: Elements with embeddings added.
+        """
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
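
Taken together with the matching deletions in bedrock.py, huggingface.py, and the other encoder modules, this hunk hoists get_exemplary_embedding, num_of_dimensions, is_unit_vector, and _add_embeddings_to_elements into BaseEmbeddingEncoder, so each provider only supplies the actual embedding calls. A minimal sketch of a subclass after the refactor, assuming embed_query and embed_documents are the only abstract methods and that EmbeddingConfig has no required fields (both assumptions, not confirmed by this diff):

    # Hypothetical encoder, for illustration only.
    from dataclasses import dataclass

    from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig


    @dataclass
    class ToyEmbeddingEncoder(BaseEmbeddingEncoder):
        config: EmbeddingConfig

        def embed_query(self, query: str) -> list[float]:
            return [0.6, 0.8]  # fixed toy vector with unit norm

        def embed_documents(self, elements: list[dict]) -> list[dict]:
            embeddings = [self.embed_query(e.get("text", "")) for e in elements]
            return self._add_embeddings_to_elements(elements, embeddings)


    encoder = ToyEmbeddingEncoder(config=EmbeddingConfig())
    print(encoder.num_of_dimensions)  # (2,), inherited via get_exemplary_embedding
    print(encoder.is_unit_vector)     # True, since ||(0.6, 0.8)|| == 1.0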
unstructured_ingest/embed/mixedbreadai.py

@@ -2,7 +2,6 @@ import os
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Optional
 
-import numpy as np
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig

@@ -66,8 +65,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
     """
 
     config: MixedbreadAIEmbeddingConfig
-
-    _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
     _request_options: Optional["RequestOptions"] = field(init=False, default=None)
 
     def get_exemplary_embedding(self) -> list[float]:

@@ -90,18 +87,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
             additional_headers={"User-Agent": USER_AGENT},
         )
 
-    @property
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        """Get the number of dimensions for the embeddings."""
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    @property
-    def is_unit_vector(self) -> bool:
-        """Check if the embedding is a unit vector."""
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-
     def _embed(self, texts: list[str]) -> list[list[float]]:
         """
         Embed a list of texts using the Mixedbread AI API.

@@ -130,27 +115,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
             responses.append(response)
         return [item.embedding for response in responses for item in response.data]
 
-    @staticmethod
-    def _add_embeddings_to_elements(
-        elements: list[dict], embeddings: list[list[float]]
-    ) -> list[dict]:
-        """
-        Add embeddings to elements.
-
-        Args:
-            elements (list[Element]): List of elements.
-            embeddings (list[list[float]]): List of embeddings.
-
-        Returns:
-            list[Element]: Elements with embeddings added.
-        """
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
-
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         """
         Embed a list of document elements.
unstructured_ingest/embed/octoai.py

@@ -1,7 +1,6 @@
-from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Optional
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
 
-import numpy as np
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig

@@ -30,19 +29,6 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
 @dataclass
 class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: OctoAiEmbeddingConfig
-    # Uses the OpenAI SDK
-    _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
-
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query("Q")
-
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
 
     def embed_query(self, query: str):
         client = self.config.get_client()

@@ -53,11 +39,3 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
         embeddings = [self.embed_query(e.get("text", "")) for e in elements]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-
-    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
unstructured_ingest/embed/openai.py

@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-import numpy as np
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig

@@ -26,17 +25,6 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
 class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: OpenAIEmbeddingConfig
 
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="Q")
-
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-
     def embed_query(self, query: str) -> list[float]:
         client = self.config.get_client()
         response = client.embeddings.create(input=query, model=self.config.embedder_model_name)

@@ -46,11 +34,3 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
         embeddings = self._embed_documents([e.get("text", "") for e in elements])
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-
-    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
unstructured_ingest/embed/togetherai.py (new file)

@@ -0,0 +1,40 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from together import Together
+
+
+class TogetherAIEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(
+        default="togethercomputer/m2-bert-80M-8k-retrieval", alias="model_name"
+    )
+
+    @requires_dependencies(["together"], extras="togetherai")
+    def get_client(self) -> "Together":
+        from together import Together
+
+        return Together(api_key=self.api_key.get_secret_value())
+
+
+@dataclass
+class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: TogetherAIEmbeddingConfig
+
+    def embed_query(self, query: str) -> list[float]:
+        return self._embed_documents(elements=[query])[0]
+
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
+        return self._add_embeddings_to_elements(elements, embeddings)
+
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        client = self.config.get_client()
+        outputs = client.embeddings.create(model=self.config.embedder_model_name, input=elements)
+        return [outputs.data[i].embedding for i in range(len(elements))]
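
A sketch of driving the new encoder end to end; the API key and element dicts are placeholders, and the together extra named in the @requires_dependencies decorator must be installed:

    from unstructured_ingest.embed.togetherai import (
        TogetherAIEmbeddingConfig,
        TogetherAIEmbeddingEncoder,
    )

    # model_name defaults to "togethercomputer/m2-bert-80M-8k-retrieval" via the alias above.
    config = TogetherAIEmbeddingConfig(api_key="tok-placeholder")
    encoder = TogetherAIEmbeddingEncoder(config=config)

    elements = [{"text": "hello"}, {"text": "world"}]
    embedded = encoder.embed_documents(elements)  # each dict gains an "embeddings" key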
unstructured_ingest/embed/vertexai.py

@@ -5,7 +5,6 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Annotated, Any, Optional
 
-import numpy as np
 from pydantic import Field, Secret, ValidationError
 from pydantic.functional_validators import BeforeValidator
 

@@ -56,17 +55,6 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
 class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VertexAIEmbeddingConfig
 
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="A sample query.")
-
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-
     def embed_query(self, query):
         return self._embed_documents(elements=[query])[0]
 

@@ -86,11 +74,3 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
         inputs = [TextEmbeddingInput(text=element) for element in elements]
         embeddings = client.get_embeddings(inputs)
         return [e.values for e in embeddings]
-
-    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
unstructured_ingest/embed/voyageai.py

@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-import numpy as np
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig

@@ -13,7 +12,7 @@ if TYPE_CHECKING:
 
 class VoyageAIEmbeddingConfig(EmbeddingConfig):
     api_key: SecretStr
-    embedder_model_name: str = Field(alias="model_name")
+    embedder_model_name: str = Field(default="voyage-3", alias="model_name")
     batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
     max_retries: int = 0

@@ -39,19 +38,6 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
 class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VoyageAIEmbeddingConfig
 
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="A sample query.")
-
-    @property
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    @property
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client: VoyageAIClient = self.config.get_client()
         response = client.embed(texts=elements, model=self.config.embedder_model_name)

@@ -63,12 +49,3 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(elements=[query])[0]
-
-    @staticmethod
-    def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
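
With the default added above, model_name is now optional when configuring the VoyageAI encoder. A quick sketch (the key is a placeholder and the override model is illustrative):

    from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig

    config = VoyageAIEmbeddingConfig(api_key="pa-placeholder")
    assert config.embedder_model_name == "voyage-3"

    # The model_name alias still allows an explicit choice:
    config = VoyageAIEmbeddingConfig(api_key="pa-placeholder", model_name="voyage-3-lite")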
unstructured_ingest/interfaces.py

@@ -100,7 +100,7 @@ class PartitionConfig(BaseConfig):
     flatten_metadata: bool = False
     metadata_exclude: list[str] = field(default_factory=list)
     metadata_include: list[str] = field(default_factory=list)
-    partition_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
+    partition_endpoint: Optional[str] = "https://api.unstructuredapp.io/general/v0/general"
     partition_by_api: bool = False
     api_key: Optional[str] = str(enhanced_field(default=None, sensitive=True)) or None
     hi_res_model_name: Optional[str] = None
unstructured_ingest/v2/cli/utils/click.py

@@ -3,7 +3,7 @@ import os.path
 from gettext import gettext, ngettext
 from gettext import gettext as _
 from pathlib import Path
-from typing import Any, Optional, Type, TypeVar
+from typing import Any, Optional, Type, TypeVar, Union
 
 import click
 from pydantic import BaseModel, ConfigDict, Secret

@@ -112,6 +112,20 @@ class DelimitedString(click.ParamType):
 BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
 
 
+def unwrap_optional(val: Any) -> tuple[Any, bool]:
+    if (
+        hasattr(val, "__origin__")
+        and hasattr(val, "__args__")
+        and val.__origin__ is Union
+        and len(val.__args__) == 2
+        and type(None) in val.__args__
+    ):
+        args = val.__args__
+        args = [a for a in args if a is not None]
+        return args[0], True
+    return val, False
+
+
 def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
     fields = config.model_fields
     config.model_config = ConfigDict(extra="ignore")

@@ -119,6 +133,7 @@ def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
     data = {k: v for k, v in flat_data.items() if k in field_names and v is not None}
     if access_config := fields.get("access_config"):
         access_config_type = access_config.annotation
+        access_config_type, is_optional = unwrap_optional(access_config_type)
         # Check if raw type is wrapped by a secret
         if (
             hasattr(access_config_type, "__origin__")

@@ -132,9 +147,13 @@ def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
         else:
             raise TypeError(f"Unrecognized access_config type: {access_config_type}")
         ac_field_names = [v.alias or k for k, v in ac_fields.items()]
-        data["access_config"] = {
+        access_config_data = {
             k: v for k, v in flat_data.items() if k in ac_field_names and v is not None
         }
+        if not access_config_data and is_optional:
+            data["access_config"] = None
+        else:
+            data["access_config"] = access_config_data
     return config.model_validate(obj=data)
 
 
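unwrap_optional reduces an Optional[X] annotation to (X, True) and returns anything else unchanged with False, which is what lets extract_config map an absent access_config to None when the field is declared Optional. A quick sketch of the behavior:

    from typing import Optional, Union

    from unstructured_ingest.v2.cli.utils.click import unwrap_optional

    unwrap_optional(Optional[int])     # -> (int, True)
    unwrap_optional(Union[str, None])  # -> (str, True); Optional[str] spelled as a Union
    unwrap_optional(int)               # -> (int, False); not a Union at all
    unwrap_optional(Union[int, str])   # -> (Union[int, str], False); no NoneType arm
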
unstructured_ingest/v2/interfaces/connector.py

@@ -1,8 +1,9 @@
 from abc import ABC
 from dataclasses import dataclass
-from typing import Any, TypeVar
+from typing import Any, TypeVar, Union
 
-from pydantic import BaseModel, Secret
+from pydantic import BaseModel, Secret, model_validator
+from pydantic.types import _SecretBase
 
 
 class AccessConfig(BaseModel):

@@ -21,6 +22,25 @@ class ConnectionConfig(BaseModel):
             return {}
         return self.access_config.get_secret_value().model_dump()
 
+    @model_validator(mode="after")
+    def check_access_config(self):
+        access_config = self.access_config
+        if self._is_access_config_optional() and access_config is None:
+            return self
+        if not isinstance(access_config, _SecretBase):
+            raise ValueError("access_config must be an instance of SecretBase")
+        return self
+
+    def _is_access_config_optional(self) -> bool:
+        access_config_type = self.model_fields["access_config"].annotation
+        return (
+            hasattr(access_config_type, "__origin__")
+            and hasattr(access_config_type, "__args__")
+            and access_config_type.__origin__ is Union
+            and len(access_config_type.__args__) == 2
+            and type(None) in access_config_type.__args__
+        )
+
 
 ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
 
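Paired with the CLI change above, this lets a connector declare access_config as Optional and pass validation with None, while any non-None value must still be Secret-wrapped. A sketch with hypothetical config classes; it assumes pydantic coerces a raw AccessConfig into Secret during field validation, so the _SecretBase check in the model validator passes:

    from typing import Optional

    from pydantic import Secret

    from unstructured_ingest.v2.interfaces.connector import AccessConfig, ConnectionConfig


    class DemoAccessConfig(AccessConfig):  # hypothetical
        token: str = "placeholder"


    class DemoConnectionConfig(ConnectionConfig):  # hypothetical
        access_config: Optional[Secret[DemoAccessConfig]] = None


    DemoConnectionConfig(access_config=None)                # accepted: annotation is Optional
    DemoConnectionConfig(access_config=DemoAccessConfig())  # accepted once wrapped into Secret
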
unstructured_ingest/v2/interfaces/downloader.py

@@ -62,6 +62,7 @@ class Downloader(BaseProcess, BaseConnector, ABC):
             date_modified = float(file_data.metadata.date_modified)
             date_created = float(file_data.metadata.date_created)
             os.utime(download_path, times=(date_created, date_modified))
+        file_data.local_download_path = str(download_path.resolve())
         return DownloadResponse(file_data=file_data, path=download_path)
 
     @property
unstructured_ingest/v2/processes/chunker.py

@@ -20,7 +20,7 @@ class ChunkerConfig(BaseModel):
         default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
     )
     chunking_endpoint: Optional[str] = Field(
-        default="https://api.unstructured.io/general/v0/general",
+        default="https://api.unstructuredapp.io/general/v0/general",
         description="If chunking via api, use the following host.",
     )
     chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
     add_source_entry,

@@ -16,11 +18,6 @@ from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
-from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
-from .databricks_volumes import (
-    databricks_volumes_destination_entry,
-    databricks_volumes_source_entry,
-)
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE

@@ -32,7 +29,7 @@ from .local import local_destination_entry, local_source_entry
 from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
 from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
-from .mongodb import mongodb_destination_entry
+from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE

@@ -47,8 +44,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
 from .singlestore import singlestore_destination_entry
-from .sql import CONNECTOR_TYPE as SQL_CONNECTOR_TYPE
-from .sql import sql_destination_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 

@@ -80,17 +75,9 @@ add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_
 
 add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
 
-add_destination_entry(
-    destination_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_destination_entry
-)
-add_source_entry(
-    source_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_source_entry
-)
-
-
-add_destination_entry(destination_type=SQL_CONNECTOR_TYPE, entry=sql_destination_entry)
-
 add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
+add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
+
 add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry)
 add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
 add_destination_entry(
unstructured_ingest/v2/processes/connectors/databricks/__init__.py (new file)

@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .volumes_aws import CONNECTOR_TYPE as VOLUMES_AWS_CONNECTOR_TYPE
+from .volumes_aws import (
+    databricks_aws_volumes_destination_entry,
+    databricks_aws_volumes_source_entry,
+)
+from .volumes_azure import CONNECTOR_TYPE as VOLUMES_AZURE_CONNECTOR_TYPE
+from .volumes_azure import (
+    databricks_azure_volumes_destination_entry,
+    databricks_azure_volumes_source_entry,
+)
+from .volumes_gcp import CONNECTOR_TYPE as VOLUMES_GCP_CONNECTOR_TYPE
+from .volumes_gcp import (
+    databricks_gcp_volumes_destination_entry,
+    databricks_gcp_volumes_source_entry,
+)
+from .volumes_native import CONNECTOR_TYPE as VOLUMES_NATIVE_CONNECTOR_TYPE
+from .volumes_native import (
+    databricks_native_volumes_destination_entry,
+    databricks_native_volumes_source_entry,
+)
+
+add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_destination_entry
+)
+
+add_source_entry(source_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_destination_entry
+)
+
+add_source_entry(
+    source_type=VOLUMES_NATIVE_CONNECTOR_TYPE, entry=databricks_native_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_NATIVE_CONNECTOR_TYPE,
+    entry=databricks_native_volumes_destination_entry,
+)
+
+add_source_entry(
+    source_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
+)
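
As with the connectors/__init__.py change earlier, registration is an import side effect: importing the package swaps the single databricks_volumes entry for the four per-cloud volume connectors. A sketch of verifying that, assuming the registry module exposes dict-like source_registry and destination_registry containers (only the add_* helpers are confirmed by this diff):

    import unstructured_ingest.v2.processes.connectors  # noqa: F401  # triggers registration

    from unstructured_ingest.v2.processes import connector_registry

    # Hypothetical inspection; the container names are assumptions.
    print(sorted(connector_registry.source_registry))
    print(sorted(connector_registry.destination_registry))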