unstructured-ingest 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; consult the registry's advisory page for this release for more details.

@@ -1,6 +1,8 @@
1
1
  import os
2
+ import uuid
2
3
 
3
4
  import pytest
5
+ from googleapiclient.errors import HttpError
4
6
 
5
7
  from test.integration.connectors.utils.constants import (
6
8
  SOURCE_TAG,
@@ -13,6 +15,9 @@ from test.integration.connectors.utils.validation.source import (
13
15
  update_fixtures,
14
16
  )
15
17
  from test.integration.utils import requires_env
18
+ from unstructured_ingest.error import (
19
+ SourceConnectionError,
20
+ )
16
21
  from unstructured_ingest.v2.interfaces import Downloader, Indexer
17
22
  from unstructured_ingest.v2.processes.connectors.google_drive import (
18
23
  CONNECTOR_TYPE,
@@ -25,6 +30,49 @@ from unstructured_ingest.v2.processes.connectors.google_drive import (
25
30
  )
26
31
 
27
32
 
33
+ @pytest.fixture
34
+ def google_drive_connection_config():
35
+ """
36
+ Build a valid GoogleDriveConnectionConfig using the environment variables.
37
+ Expects:
38
+ - GOOGLE_DRIVE_ID
39
+ - GOOGLE_DRIVE_SERVICE_KEY
40
+ """
41
+ drive_id = os.getenv("GOOGLE_DRIVE_ID")
42
+ service_key = os.getenv("GOOGLE_DRIVE_SERVICE_KEY")
43
+ if not drive_id or not service_key:
44
+ pytest.skip("Google Drive credentials not provided in environment variables.")
45
+
46
+ access_config = GoogleDriveAccessConfig(service_account_key=service_key)
47
+ return GoogleDriveConnectionConfig(drive_id=drive_id, access_config=access_config)
48
+
49
+
50
+ @pytest.fixture
51
+ def google_drive_empty_folder(google_drive_connection_config):
52
+ """
53
+ Creates an empty folder on Google Drive for testing the "empty folder" case.
54
+ The folder is deleted after the test.
55
+ """
56
+ from google.oauth2 import service_account
57
+ from googleapiclient.discovery import build
58
+
59
+ access_config = google_drive_connection_config.access_config.get_secret_value()
60
+ creds = service_account.Credentials.from_service_account_info(access_config.service_account_key)
61
+ service = build("drive", "v3", credentials=creds)
62
+
63
+ # Create an empty folder.
64
+ file_metadata = {
65
+ "name": f"utic-empty-folder-{uuid.uuid4()}",
66
+ "mimeType": "application/vnd.google-apps.folder",
67
+ }
68
+ folder = service.files().create(body=file_metadata, fields="id, name").execute()
69
+ folder_id = folder.get("id")
70
+ try:
71
+ yield folder_id
72
+ finally:
73
+ service.files().delete(fileId=folder_id).execute()
74
+
75
+
28
76
  @requires_env("GOOGLE_DRIVE_SERVICE_KEY")
29
77
  @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE)
30
78
  def test_google_drive_source(temp_dir):
@@ -114,3 +162,96 @@ def source_connector_validation(
114
162
  save_downloads=configs.validate_downloaded_files,
115
163
  save_filedata=configs.validate_file_data,
116
164
  )
165
+
166
+
167
+ # Precheck fails when the drive ID has an appended parameter (simulate copy-paste error)
168
+ @pytest.mark.tags("google-drive", "precheck")
169
+ @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
170
+ def test_google_drive_precheck_invalid_parameter(google_drive_connection_config):
171
+ # Append a query parameter as often happens when copying from a URL.
172
+ invalid_drive_id = google_drive_connection_config.drive_id + "?usp=sharing"
173
+ connection_config = GoogleDriveConnectionConfig(
174
+ drive_id=invalid_drive_id,
175
+ access_config=google_drive_connection_config.access_config,
176
+ )
177
+ index_config = GoogleDriveIndexerConfig(recursive=True)
178
+ indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
179
+ with pytest.raises(SourceConnectionError) as excinfo:
180
+ indexer.precheck()
181
+ assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()
182
+
183
+
184
+ # Precheck fails due to lack of permission (simulate via monkeypatching).
185
+ @pytest.mark.tags("google-drive", "precheck")
186
+ @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
187
+ def test_google_drive_precheck_no_permission(google_drive_connection_config, monkeypatch):
188
+ index_config = GoogleDriveIndexerConfig(recursive=True)
189
+ indexer = GoogleDriveIndexer(
190
+ connection_config=google_drive_connection_config,
191
+ index_config=index_config,
192
+ )
193
+
194
+ # Monkeypatch get_root_info to always raise an HTTP 403 error.
195
+ def fake_get_root_info(files_client, object_id):
196
+ raise HttpError(
197
+ resp=type("Response", (), {"status": 403, "reason": "Forbidden"})(),
198
+ content=b"Forbidden",
199
+ )
200
+
201
+ monkeypatch.setattr(indexer, "get_root_info", fake_get_root_info)
202
+ with pytest.raises(SourceConnectionError) as excinfo:
203
+ indexer.precheck()
204
+ assert "forbidden" in str(excinfo.value).lower() or "permission" in str(excinfo.value).lower()
205
+
206
+
207
+ # Precheck fails when the folder is empty.
208
+ # @pytest.mark.tags("google-drive", "precheck")
209
+ # @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
210
+ # def test_google_drive_precheck_empty_folder(
211
+ # google_drive_connection_config, google_drive_empty_folder
212
+ # ):
213
+ # # Use the empty folder's ID as the target.
214
+ # connection_config = GoogleDriveConnectionConfig(
215
+ # drive_id=google_drive_empty_folder,
216
+ # access_config=google_drive_connection_config.access_config,
217
+ # )
218
+
219
+ # index_config = GoogleDriveIndexerConfig(recursive=True)
220
+ # indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
221
+ # with pytest.raises(SourceConnectionError) as excinfo:
222
+ # indexer.precheck()
223
+ # assert "empty folder" in str(excinfo.value).lower()
224
+
225
+
226
+ @pytest.mark.tags("google-drive", "count", "integration")
227
+ @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
228
+ def test_google_drive_count_files(google_drive_connection_config):
229
+ """
230
+ This test verifies that the count_files_recursively method returns the expected count of files.
231
+ According to the test credentials, there are 3 files in the root directory and 1 nested file,
232
+ so the total count should be 4.
233
+ """
234
+ # I assumed that we're applying the same extension filter as with other tests
235
+ # However there's 6 files in total in the test dir
236
+ extensions_filter = ["pdf", "docx"]
237
+ with google_drive_connection_config.get_client() as client:
238
+ count = GoogleDriveIndexer.count_files_recursively(
239
+ client, google_drive_connection_config.drive_id, extensions_filter
240
+ )
241
+ assert count == 4, f"Expected file count of 4, but got {count}"
242
+
243
+
244
+ # Precheck fails with a completely invalid drive ID.
245
+ @pytest.mark.tags("google-drive", "precheck")
246
+ @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
247
+ def test_google_drive_precheck_invalid_drive_id(google_drive_connection_config):
248
+ invalid_drive_id = "invalid_drive_id"
249
+ connection_config = GoogleDriveConnectionConfig(
250
+ drive_id=invalid_drive_id,
251
+ access_config=google_drive_connection_config.access_config,
252
+ )
253
+ index_config = GoogleDriveIndexerConfig(recursive=True)
254
+ indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
255
+ with pytest.raises(SourceConnectionError) as excinfo:
256
+ indexer.precheck()
257
+ assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()
@@ -1 +1 @@
1
- __version__ = "0.5.1" # pragma: no cover
1
+ __version__ = "0.5.2" # pragma: no cover
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, AsyncIterable
8
8
  from pydantic import Field, SecretStr
9
9
 
10
10
  from unstructured_ingest.embed.interfaces import (
11
+ EMBEDDINGS_KEY,
11
12
  AsyncBaseEmbeddingEncoder,
12
13
  BaseEmbeddingEncoder,
13
14
  EmbeddingConfig,
@@ -145,9 +146,12 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
145
146
  return response_body.get("embedding")
146
147
 
147
148
  def embed_documents(self, elements: list[dict]) -> list[dict]:
148
- embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
149
- elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
150
- return elements_with_embeddings
149
+ elements = elements.copy()
150
+ elements_with_text = [e for e in elements if e.get("text")]
151
+ embeddings = [self.embed_query(query=e["text"]) for e in elements_with_text]
152
+ for element, embedding in zip(elements_with_text, embeddings):
153
+ element[EMBEDDINGS_KEY] = embedding
154
+ return elements
151
155
 
152
156
 
153
157
  @dataclass
@@ -186,8 +190,11 @@ class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
186
190
  raise ValueError(f"Error raised by inference endpoint: {e}")
187
191
 
188
192
  async def embed_documents(self, elements: list[dict]) -> list[dict]:
193
+ elements = elements.copy()
194
+ elements_with_text = [e for e in elements if e.get("text")]
189
195
  embeddings = await asyncio.gather(
190
- *[self.embed_query(query=e.get("text", "")) for e in elements]
196
+ *[self.embed_query(query=e.get("text", "")) for e in elements_with_text]
191
197
  )
192
- elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
193
- return elements_with_embeddings
198
+ for element, embedding in zip(elements_with_text, embeddings):
199
+ element[EMBEDDINGS_KEY] = embedding
200
+ return elements
@@ -3,7 +3,11 @@ from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  from pydantic import Field
5
5
 
6
- from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
6
+ from unstructured_ingest.embed.interfaces import (
7
+ EMBEDDINGS_KEY,
8
+ BaseEmbeddingEncoder,
9
+ EmbeddingConfig,
10
+ )
7
11
  from unstructured_ingest.utils.dep_check import requires_dependencies
8
12
 
9
13
  if TYPE_CHECKING:
@@ -52,6 +56,9 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
52
56
  return embeddings.tolist()
53
57
 
54
58
  def embed_documents(self, elements: list[dict]) -> list[dict]:
55
- embeddings = self._embed_documents([e.get("text", "") for e in elements])
56
- elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
57
- return elements_with_embeddings
59
+ elements = elements.copy()
60
+ elements_with_text = [e for e in elements if e.get("text")]
61
+ embeddings = self._embed_documents([e["text"] for e in elements_with_text])
62
+ for element, embedding in zip(elements_with_text, embeddings):
63
+ element[EMBEDDINGS_KEY] = embedding
64
+ return elements
@@ -6,6 +6,8 @@ from typing import Optional
6
6
  import numpy as np
7
7
  from pydantic import BaseModel, Field
8
8
 
9
+ EMBEDDINGS_KEY = "embeddings"
10
+
9
11
 
10
12
  class EmbeddingConfig(BaseModel):
11
13
  batch_size: Optional[int] = Field(
@@ -26,27 +28,6 @@ class BaseEncoder(ABC):
26
28
  if possible"""
27
29
  return e
28
30
 
29
- @staticmethod
30
- def _add_embeddings_to_elements(
31
- elements: list[dict], embeddings: list[list[float]]
32
- ) -> list[dict]:
33
- """
34
- Add embeddings to elements.
35
-
36
- Args:
37
- elements (list[Element]): List of elements.
38
- embeddings (list[list[float]]): List of embeddings.
39
-
40
- Returns:
41
- list[Element]: Elements with embeddings added.
42
- """
43
- assert len(elements) == len(embeddings)
44
- elements_w_embedding = []
45
- for i, element in enumerate(elements):
46
- element["embeddings"] = embeddings[i]
47
- elements_w_embedding.append(element)
48
- return elements
49
-
50
31
 
51
32
  @dataclass
52
33
  class BaseEmbeddingEncoder(BaseEncoder, ABC):
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING
6
6
  from pydantic import Field, SecretStr
7
7
 
8
8
  from unstructured_ingest.embed.interfaces import (
9
+ EMBEDDINGS_KEY,
9
10
  AsyncBaseEmbeddingEncoder,
10
11
  BaseEmbeddingEncoder,
11
12
  EmbeddingConfig,
@@ -134,8 +135,12 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
134
135
  Returns:
135
136
  list[Element]: Elements with embeddings.
136
137
  """
137
- embeddings = self._embed([e.get("text", "") for e in elements])
138
- return self._add_embeddings_to_elements(elements, embeddings)
138
+ elements = elements.copy()
139
+ elements_with_text = [e for e in elements if e.get("text")]
140
+ embeddings = self._embed([e["text"] for e in elements_with_text])
141
+ for element, embedding in zip(elements_with_text, embeddings):
142
+ element[EMBEDDINGS_KEY] = embedding
143
+ return elements
139
144
 
140
145
  def embed_query(self, query: str) -> list[float]:
141
146
  """
@@ -209,8 +214,12 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
209
214
  Returns:
210
215
  list[Element]: Elements with embeddings.
211
216
  """
212
- embeddings = await self._embed([e.get("text", "") for e in elements])
213
- return self._add_embeddings_to_elements(elements, embeddings)
217
+ elements = elements.copy()
218
+ elements_with_text = [e for e in elements if e.get("text")]
219
+ embeddings = await self._embed([e["text"] for e in elements_with_text])
220
+ for element, embedding in zip(elements_with_text, embeddings):
221
+ element[EMBEDDINGS_KEY] = embedding
222
+ return elements
214
223
 
215
224
  async def embed_query(self, query: str) -> list[float]:
216
225
  """
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
4
4
  from pydantic import Field, SecretStr
5
5
 
6
6
  from unstructured_ingest.embed.interfaces import (
7
+ EMBEDDINGS_KEY,
7
8
  AsyncBaseEmbeddingEncoder,
8
9
  BaseEmbeddingEncoder,
9
10
  EmbeddingConfig,
@@ -89,7 +90,9 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
89
90
  return response.data[0].embedding
90
91
 
91
92
  def embed_documents(self, elements: list[dict]) -> list[dict]:
92
- texts = [e.get("text", "") for e in elements]
93
+ elements = elements.copy()
94
+ elements_with_text = [e for e in elements if e.get("text")]
95
+ texts = [e["text"] for e in elements_with_text]
93
96
  embeddings = []
94
97
  client = self.config.get_client()
95
98
  try:
@@ -100,8 +103,9 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
100
103
  embeddings.extend([data.embedding for data in response.data])
101
104
  except Exception as e:
102
105
  raise self.wrap_error(e=e)
103
- elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
104
- return elements_with_embeddings
106
+ for element, embedding in zip(elements_with_text, embeddings):
107
+ element[EMBEDDINGS_KEY] = embedding
108
+ return elements
105
109
 
106
110
 
107
111
  @dataclass
@@ -122,7 +126,9 @@ class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
122
126
  return response.data[0].embedding
123
127
 
124
128
  async def embed_documents(self, elements: list[dict]) -> list[dict]:
125
- texts = [e.get("text", "") for e in elements]
129
+ elements = elements.copy()
130
+ elements_with_text = [e for e in elements if e.get("text")]
131
+ texts = [e["text"] for e in elements_with_text]
126
132
  client = self.config.get_async_client()
127
133
  embeddings = []
128
134
  try:
@@ -133,5 +139,6 @@ class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
133
139
  embeddings.extend([data.embedding for data in response.data])
134
140
  except Exception as e:
135
141
  raise self.wrap_error(e=e)
136
- elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
137
- return elements_with_embeddings
142
+ for element, embedding in zip(elements_with_text, embeddings):
143
+ element[EMBEDDINGS_KEY] = embedding
144
+ return elements
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
4
4
  from pydantic import Field, SecretStr
5
5
 
6
6
  from unstructured_ingest.embed.interfaces import (
7
+ EMBEDDINGS_KEY,
7
8
  AsyncBaseEmbeddingEncoder,
8
9
  BaseEmbeddingEncoder,
9
10
  EmbeddingConfig,
@@ -82,7 +83,9 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
82
83
 
83
84
  def embed_documents(self, elements: list[dict]) -> list[dict]:
84
85
  client = self.config.get_client()
85
- texts = [e.get("text", "") for e in elements]
86
+ elements = elements.copy()
87
+ elements_with_text = [e for e in elements if e.get("text")]
88
+ texts = [e["text"] for e in elements_with_text]
86
89
  embeddings = []
87
90
  try:
88
91
  for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
@@ -92,8 +95,9 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
92
95
  embeddings.extend([data.embedding for data in response.data])
93
96
  except Exception as e:
94
97
  raise self.wrap_error(e=e)
95
- elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
96
- return elements_with_embeddings
98
+ for element, embedding in zip(elements_with_text, embeddings):
99
+ element[EMBEDDINGS_KEY] = embedding
100
+ return elements
97
101
 
98
102
 
99
103
  @dataclass
@@ -115,7 +119,9 @@ class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
115
119
 
116
120
  async def embed_documents(self, elements: list[dict]) -> list[dict]:
117
121
  client = self.config.get_async_client()
118
- texts = [e.get("text", "") for e in elements]
122
+ elements = elements.copy()
123
+ elements_with_text = [e for e in elements if e.get("text")]
124
+ texts = [e["text"] for e in elements_with_text]
119
125
  embeddings = []
120
126
  try:
121
127
  for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
@@ -125,5 +131,6 @@ class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
125
131
  embeddings.extend([data.embedding for data in response.data])
126
132
  except Exception as e:
127
133
  raise self.wrap_error(e=e)
128
- elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
129
- return elements_with_embeddings
134
+ for element, embedding in zip(elements_with_text, embeddings):
135
+ element[EMBEDDINGS_KEY] = embedding
136
+ return elements
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
4
4
  from pydantic import Field, SecretStr
5
5
 
6
6
  from unstructured_ingest.embed.interfaces import (
7
+ EMBEDDINGS_KEY,
7
8
  AsyncBaseEmbeddingEncoder,
8
9
  BaseEmbeddingEncoder,
9
10
  EmbeddingConfig,
@@ -67,8 +68,12 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
67
68
  return self._embed_documents(elements=[query])[0]
68
69
 
69
70
  def embed_documents(self, elements: list[dict]) -> list[dict]:
70
- embeddings = self._embed_documents([e.get("text", "") for e in elements])
71
- return self._add_embeddings_to_elements(elements, embeddings)
71
+ elements = elements.copy()
72
+ elements_with_text = [e for e in elements if e.get("text")]
73
+ embeddings = self._embed_documents([e["text"] for e in elements_with_text])
74
+ for element, embedding in zip(elements_with_text, embeddings):
75
+ element[EMBEDDINGS_KEY] = embedding
76
+ return elements
72
77
 
73
78
  def _embed_documents(self, elements: list[str]) -> list[list[float]]:
74
79
  client = self.config.get_client()
@@ -98,8 +103,12 @@ class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
98
103
  return embedding[0]
99
104
 
100
105
  async def embed_documents(self, elements: list[dict]) -> list[dict]:
101
- embeddings = await self._embed_documents([e.get("text", "") for e in elements])
102
- return self._add_embeddings_to_elements(elements, embeddings)
106
+ elements = elements.copy()
107
+ elements_with_text = [e for e in elements if e.get("text")]
108
+ embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
109
+ for element, embedding in zip(elements_with_text, embeddings):
110
+ element[EMBEDDINGS_KEY] = embedding
111
+ return elements
103
112
 
104
113
  async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
105
114
  client = self.config.get_async_client()
@@ -9,6 +9,7 @@ from pydantic import Field, Secret, ValidationError
9
9
  from pydantic.functional_validators import BeforeValidator
10
10
 
11
11
  from unstructured_ingest.embed.interfaces import (
12
+ EMBEDDINGS_KEY,
12
13
  AsyncBaseEmbeddingEncoder,
13
14
  BaseEmbeddingEncoder,
14
15
  EmbeddingConfig,
@@ -75,9 +76,12 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
75
76
  return self._embed_documents(elements=[query])[0]
76
77
 
77
78
  def embed_documents(self, elements: list[dict]) -> list[dict]:
78
- embeddings = self._embed_documents([e.get("text", "") for e in elements])
79
- elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
80
- return elements_with_embeddings
79
+ elements = elements.copy()
80
+ elements_with_text = [e for e in elements if e.get("text")]
81
+ embeddings = self._embed_documents([e["text"] for e in elements_with_text])
82
+ for element, embedding in zip(elements_with_text, embeddings):
83
+ element[EMBEDDINGS_KEY] = embedding
84
+ return elements
81
85
 
82
86
  @requires_dependencies(
83
87
  ["vertexai"],
@@ -110,9 +114,12 @@ class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
110
114
  return embedding[0]
111
115
 
112
116
  async def embed_documents(self, elements: list[dict]) -> list[dict]:
113
- embeddings = await self._embed_documents([e.get("text", "") for e in elements])
114
- elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
115
- return elements_with_embeddings
117
+ elements = elements.copy()
118
+ elements_with_text = [e for e in elements if e.get("text")]
119
+ embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
120
+ for element, embedding in zip(elements_with_text, embeddings):
121
+ element[EMBEDDINGS_KEY] = embedding
122
+ return elements
116
123
 
117
124
  @requires_dependencies(
118
125
  ["vertexai"],
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Optional
4
4
  from pydantic import Field, SecretStr
5
5
 
6
6
  from unstructured_ingest.embed.interfaces import (
7
+ EMBEDDINGS_KEY,
7
8
  AsyncBaseEmbeddingEncoder,
8
9
  BaseEmbeddingEncoder,
9
10
  EmbeddingConfig,
@@ -107,8 +108,12 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
107
108
  return embeddings
108
109
 
109
110
  def embed_documents(self, elements: list[dict]) -> list[dict]:
110
- embeddings = self._embed_documents([e.get("text", "") for e in elements])
111
- return self._add_embeddings_to_elements(elements, embeddings)
111
+ elements = elements.copy()
112
+ elements_with_text = [e for e in elements if e.get("text")]
113
+ embeddings = self._embed_documents([e["text"] for e in elements_with_text])
114
+ for element, embedding in zip(elements_with_text, embeddings):
115
+ element[EMBEDDINGS_KEY] = embedding
116
+ return elements
112
117
 
113
118
  def embed_query(self, query: str) -> list[float]:
114
119
  return self._embed_documents(elements=[query])[0]
@@ -135,8 +140,12 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
135
140
  return embeddings
136
141
 
137
142
  async def embed_documents(self, elements: list[dict]) -> list[dict]:
138
- embeddings = await self._embed_documents([e.get("text", "") for e in elements])
139
- return self._add_embeddings_to_elements(elements, embeddings)
143
+ elements = elements.copy()
144
+ elements_with_text = [e for e in elements if e.get("text")]
145
+ embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
146
+ for element, embedding in zip(elements_with_text, embeddings):
147
+ element[EMBEDDINGS_KEY] = embedding
148
+ return elements
140
149
 
141
150
  async def embed_query(self, query: str) -> list[float]:
142
151
  embedding = await self._embed_documents(elements=[query])
@@ -132,12 +132,141 @@ class GoogleDriveIndexer(Indexer):
132
132
  ]
133
133
  )
134
134
 
135
+ @staticmethod
136
+ def verify_drive_api_enabled(client) -> None:
137
+ from googleapiclient.errors import HttpError
138
+
139
+ """
140
+ Makes a lightweight API call to verify that the Drive API is enabled.
141
+ If the API is not enabled, an HttpError should be raised.
142
+ """
143
+ try:
144
+ # A very minimal call: list 1 file from the drive.
145
+ client.list(spaces="drive", pageSize=1, fields="files(id)").execute()
146
+ except HttpError as e:
147
+ error_content = e.content.decode() if hasattr(e, "content") else ""
148
+ lower_error = error_content.lower()
149
+ if "drive api" in lower_error and (
150
+ "not enabled" in lower_error or "not been used" in lower_error
151
+ ):
152
+ raise SourceConnectionError(
153
+ "Google Drive API is not enabled for your project. \
154
+ Please enable it in the Google Cloud Console."
155
+ )
156
+ else:
157
+ raise SourceConnectionError("Google drive API unreachable for an unknown reason!")
158
+
159
+ @staticmethod
160
+ def count_files_recursively(files_client, folder_id: str, extensions: list[str] = None) -> int:
161
+ """
162
+ Count non-folder files recursively under the given folder.
163
+ If `extensions` is provided, only count files
164
+ whose `fileExtension` matches one of the values.
165
+ """
166
+ count = 0
167
+ stack = [folder_id]
168
+ while stack:
169
+ current_folder = stack.pop()
170
+ # Always list all items under the current folder.
171
+ query = f"'{current_folder}' in parents"
172
+ page_token = None
173
+ while True:
174
+ response = files_client.list(
175
+ spaces="drive",
176
+ q=query,
177
+ fields="nextPageToken, files(id, mimeType, fileExtension)",
178
+ pageToken=page_token,
179
+ pageSize=1000,
180
+ ).execute()
181
+ for item in response.get("files", []):
182
+ if item.get("mimeType") == "application/vnd.google-apps.folder":
183
+ # Always traverse sub-folders regardless of extension filter.
184
+ stack.append(item["id"])
185
+ else:
186
+ if extensions:
187
+ # Use a case-insensitive comparison for the file extension.
188
+ file_ext = (item.get("fileExtension") or "").lower()
189
+ valid_exts = [e.lower() for e in extensions]
190
+ if file_ext in valid_exts:
191
+ count += 1
192
+ else:
193
+ count += 1
194
+ page_token = response.get("nextPageToken")
195
+ if not page_token:
196
+ break
197
+ return count
198
+
135
199
  def precheck(self) -> None:
200
+ """
201
+ Enhanced precheck that verifies not only connectivity
202
+ but also that the provided drive_id is valid and accessible.
203
+ """
136
204
  try:
137
- self.connection_config.get_client()
205
+ with self.connection_config.get_client() as client:
206
+ # First, verify that the Drive API is enabled.
207
+ self.verify_drive_api_enabled(client)
208
+
209
+ # Try to retrieve metadata for the drive id.
210
+ # This will catch errors such as an invalid drive id or insufficient permissions.
211
+ root_info = self.get_root_info(
212
+ files_client=client, object_id=self.connection_config.drive_id
213
+ )
214
+ logger.info(
215
+ f"Successfully retrieved drive root info: "
216
+ f"{root_info.get('name', 'Unnamed')} (ID: {root_info.get('id')})"
217
+ )
218
+
219
+ # If the target is a folder, perform file count check.
220
+ if self.is_dir(root_info):
221
+ if self.index_config.recursive:
222
+ file_count = self.count_files_recursively(
223
+ client,
224
+ self.connection_config.drive_id,
225
+ extensions=self.index_config.extensions,
226
+ )
227
+ if file_count == 0:
228
+ logger.warning(
229
+ "Empty folder: no files found recursively in the folder. \
230
+ Please verify that the folder contains files and \
231
+ that the service account has proper permissions."
232
+ )
233
+ # raise SourceConnectionError(
234
+ # "Empty folder: no files found recursively in the folder. "
235
+ # "Please verify that the folder contains files and \
236
+ # that the service account has proper permissions."
237
+ # )
238
+ else:
239
+ logger.info(f"Found {file_count} files recursively in the folder.")
240
+ else:
241
+ # Non-recursive: check for at least one immediate non-folder child.
242
+ response = client.list(
243
+ spaces="drive",
244
+ fields="files(id)",
245
+ pageSize=1,
246
+ q=f"'{self.connection_config.drive_id}' in parents",
247
+ ).execute()
248
+ if not response.get("files"):
249
+ logger.warning(
250
+ "Empty folder: no files found at the folder's root level. "
251
+ "Please verify that the folder contains files and \
252
+ that the service account has proper permissions."
253
+ )
254
+ # raise SourceConnectionError(
255
+ # "Empty folder: no files found at the folder's root level. "
256
+ # "Please verify that the folder contains files and \
257
+ # that the service account has proper permissions."
258
+ # )
259
+ else:
260
+ logger.info("Found files at the folder's root level.")
261
+ else:
262
+ # If the target is a file, precheck passes.
263
+ logger.info("Drive ID corresponds to a file. Precheck passed.")
264
+
138
265
  except Exception as e:
139
- logger.error(f"failed to validate connection: {e}", exc_info=True)
140
- raise SourceConnectionError(f"failed to validate connection: {e}")
266
+ logger.error(
267
+ "Failed to validate Google Drive connection during precheck", exc_info=True
268
+ )
269
+ raise SourceConnectionError(f"Precheck failed: {e}")
141
270
 
142
271
  @staticmethod
143
272
  def is_dir(record: dict) -> bool:
@@ -1,6 +1,7 @@
1
+ import json
1
2
  from contextlib import contextmanager
2
3
  from dataclasses import dataclass, field
3
- from typing import TYPE_CHECKING, Generator, Optional
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional
4
5
 
5
6
  import numpy as np
6
7
  import pandas as pd
@@ -15,6 +16,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
15
16
  SourceRegistryEntry,
16
17
  )
17
18
  from unstructured_ingest.v2.processes.connectors.sql.sql import (
19
+ _DATE_COLUMNS,
18
20
  SQLAccessConfig,
19
21
  SqlBatchFileData,
20
22
  SQLConnectionConfig,
@@ -26,6 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
26
28
  SQLUploaderConfig,
27
29
  SQLUploadStager,
28
30
  SQLUploadStagerConfig,
31
+ parse_date_string,
29
32
  )
30
33
 
31
34
  if TYPE_CHECKING:
@@ -34,6 +37,17 @@ if TYPE_CHECKING:
34
37
 
35
38
  CONNECTOR_TYPE = "snowflake"
36
39
 
40
+ _ARRAY_COLUMNS = (
41
+ "embeddings",
42
+ "languages",
43
+ "link_urls",
44
+ "link_texts",
45
+ "sent_from",
46
+ "sent_to",
47
+ "emphasized_text_contents",
48
+ "emphasized_text_tags",
49
+ )
50
+
37
51
 
38
52
  class SnowflakeAccessConfig(SQLAccessConfig):
39
53
  password: Optional[str] = Field(default=None, description="DB password")
@@ -160,6 +174,42 @@ class SnowflakeUploader(SQLUploader):
160
174
  connector_type: str = CONNECTOR_TYPE
161
175
  values_delimiter: str = "?"
162
176
 
177
+ def prepare_data(
178
+ self, columns: list[str], data: tuple[tuple[Any, ...], ...]
179
+ ) -> list[tuple[Any, ...]]:
180
+ output = []
181
+ for row in data:
182
+ parsed = []
183
+ for column_name, value in zip(columns, row):
184
+ if column_name in _DATE_COLUMNS:
185
+ if value is None or pd.isna(value): # pandas is nan
186
+ parsed.append(None)
187
+ else:
188
+ parsed.append(parse_date_string(value))
189
+ elif column_name in _ARRAY_COLUMNS:
190
+ if not isinstance(value, list) and (
191
+ value is None or pd.isna(value)
192
+ ): # pandas is nan
193
+ parsed.append(None)
194
+ else:
195
+ parsed.append(json.dumps(value))
196
+ else:
197
+ parsed.append(value)
198
+ output.append(tuple(parsed))
199
+ return output
200
+
201
+ def _parse_values(self, columns: list[str]) -> str:
202
+ return ",".join(
203
+ [
204
+ (
205
+ f"PARSE_JSON({self.values_delimiter})"
206
+ if col in _ARRAY_COLUMNS
207
+ else self.values_delimiter
208
+ )
209
+ for col in columns
210
+ ]
211
+ )
212
+
163
213
  def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
164
214
  if self.can_delete():
165
215
  self.delete_by_record_id(file_data=file_data)
@@ -173,10 +223,10 @@ class SnowflakeUploader(SQLUploader):
173
223
  self._fit_to_schema(df=df)
174
224
 
175
225
  columns = list(df.columns)
176
- stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
226
+ stmt = "INSERT INTO {table_name} ({columns}) SELECT {values}".format(
177
227
  table_name=self.upload_config.table_name,
178
228
  columns=",".join(columns),
179
- values=",".join([self.values_delimiter for _ in columns]),
229
+ values=self._parse_values(columns),
180
230
  )
181
231
  logger.info(
182
232
  f"writing a total of {len(df)} elements via"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,31 +22,31 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: dataclasses-json
26
- Requires-Dist: click
27
- Requires-Dist: opentelemetry-sdk
25
+ Requires-Dist: pandas
28
26
  Requires-Dist: pydantic>=2.7
27
+ Requires-Dist: dataclasses-json
29
28
  Requires-Dist: python-dateutil
30
- Requires-Dist: pandas
29
+ Requires-Dist: opentelemetry-sdk
30
+ Requires-Dist: click
31
31
  Requires-Dist: tqdm
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
34
34
  Provides-Extra: astradb
35
35
  Requires-Dist: astrapy; extra == "astradb"
36
36
  Provides-Extra: azure
37
- Requires-Dist: adlfs; extra == "azure"
38
37
  Requires-Dist: fsspec; extra == "azure"
38
+ Requires-Dist: adlfs; extra == "azure"
39
39
  Provides-Extra: azure-ai-search
40
40
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
41
41
  Provides-Extra: bedrock
42
- Requires-Dist: aioboto3; extra == "bedrock"
43
42
  Requires-Dist: boto3; extra == "bedrock"
43
+ Requires-Dist: aioboto3; extra == "bedrock"
44
44
  Provides-Extra: biomed
45
45
  Requires-Dist: requests; extra == "biomed"
46
46
  Requires-Dist: bs4; extra == "biomed"
47
47
  Provides-Extra: box
48
- Requires-Dist: boxfs; extra == "box"
49
48
  Requires-Dist: fsspec; extra == "box"
49
+ Requires-Dist: boxfs; extra == "box"
50
50
  Provides-Extra: chroma
51
51
  Requires-Dist: chromadb; extra == "chroma"
52
52
  Provides-Extra: clarifai
@@ -92,12 +92,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
92
92
  Provides-Extra: epub
93
93
  Requires-Dist: unstructured[epub]; extra == "epub"
94
94
  Provides-Extra: gcs
95
- Requires-Dist: gcsfs; extra == "gcs"
96
- Requires-Dist: bs4; extra == "gcs"
97
95
  Requires-Dist: fsspec; extra == "gcs"
96
+ Requires-Dist: bs4; extra == "gcs"
97
+ Requires-Dist: gcsfs; extra == "gcs"
98
98
  Provides-Extra: github
99
- Requires-Dist: requests; extra == "github"
100
99
  Requires-Dist: pygithub>1.58.0; extra == "github"
100
+ Requires-Dist: requests; extra == "github"
101
101
  Provides-Extra: gitlab
102
102
  Requires-Dist: python-gitlab; extra == "gitlab"
103
103
  Provides-Extra: google-drive
@@ -122,20 +122,20 @@ Requires-Dist: pymongo; extra == "mongodb"
122
122
  Provides-Extra: msg
123
123
  Requires-Dist: unstructured[msg]; extra == "msg"
124
124
  Provides-Extra: neo4j
125
- Requires-Dist: neo4j; extra == "neo4j"
126
- Requires-Dist: cymple; extra == "neo4j"
127
125
  Requires-Dist: networkx; extra == "neo4j"
126
+ Requires-Dist: cymple; extra == "neo4j"
127
+ Requires-Dist: neo4j; extra == "neo4j"
128
128
  Provides-Extra: notion
129
- Requires-Dist: backoff; extra == "notion"
130
129
  Requires-Dist: htmlBuilder; extra == "notion"
131
- Requires-Dist: httpx; extra == "notion"
130
+ Requires-Dist: backoff; extra == "notion"
132
131
  Requires-Dist: notion-client; extra == "notion"
132
+ Requires-Dist: httpx; extra == "notion"
133
133
  Provides-Extra: odt
134
134
  Requires-Dist: unstructured[odt]; extra == "odt"
135
135
  Provides-Extra: onedrive
136
136
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
137
- Requires-Dist: msal; extra == "onedrive"
138
137
  Requires-Dist: bs4; extra == "onedrive"
138
+ Requires-Dist: msal; extra == "onedrive"
139
139
  Provides-Extra: openai
140
140
  Requires-Dist: openai; extra == "openai"
141
141
  Requires-Dist: tiktoken; extra == "openai"
@@ -169,13 +169,13 @@ Requires-Dist: unstructured[rst]; extra == "rst"
169
169
  Provides-Extra: rtf
170
170
  Requires-Dist: unstructured[rtf]; extra == "rtf"
171
171
  Provides-Extra: s3
172
- Requires-Dist: fsspec; extra == "s3"
173
172
  Requires-Dist: s3fs; extra == "s3"
173
+ Requires-Dist: fsspec; extra == "s3"
174
174
  Provides-Extra: salesforce
175
175
  Requires-Dist: simple-salesforce; extra == "salesforce"
176
176
  Provides-Extra: sftp
177
- Requires-Dist: paramiko; extra == "sftp"
178
177
  Requires-Dist: fsspec; extra == "sftp"
178
+ Requires-Dist: paramiko; extra == "sftp"
179
179
  Provides-Extra: sharepoint
180
180
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
181
181
  Requires-Dist: msal; extra == "sharepoint"
@@ -184,19 +184,19 @@ Requires-Dist: singlestoredb; extra == "singlestore"
184
184
  Provides-Extra: slack
185
185
  Requires-Dist: slack-sdk[optional]; extra == "slack"
186
186
  Provides-Extra: snowflake
187
- Requires-Dist: snowflake-connector-python; extra == "snowflake"
188
187
  Requires-Dist: psycopg2-binary; extra == "snowflake"
188
+ Requires-Dist: snowflake-connector-python; extra == "snowflake"
189
189
  Provides-Extra: togetherai
190
190
  Requires-Dist: together; extra == "togetherai"
191
191
  Provides-Extra: tsv
192
192
  Requires-Dist: unstructured[tsv]; extra == "tsv"
193
193
  Provides-Extra: vastdb
194
- Requires-Dist: pyarrow; extra == "vastdb"
195
194
  Requires-Dist: vastdb; extra == "vastdb"
195
+ Requires-Dist: pyarrow; extra == "vastdb"
196
196
  Requires-Dist: ibis; extra == "vastdb"
197
197
  Provides-Extra: vectara
198
- Requires-Dist: httpx; extra == "vectara"
199
198
  Requires-Dist: requests; extra == "vectara"
199
+ Requires-Dist: httpx; extra == "vectara"
200
200
  Requires-Dist: aiofiles; extra == "vectara"
201
201
  Provides-Extra: weaviate
202
202
  Requires-Dist: weaviate-client; extra == "weaviate"
@@ -10,7 +10,7 @@ test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNp
10
10
  test/integration/connectors/test_chroma.py,sha256=NuQv0PWPM0_LQfdPeUd6IYKqaKKXWmVaHGWjq5aBfOY,3721
11
11
  test/integration/connectors/test_confluence.py,sha256=Ju0gRQbD2g9l9iRf2HDZKi7RyPnBGtFRWcGpsqhO3F8,3588
12
12
  test/integration/connectors/test_delta_table.py,sha256=4qm2Arfc9Eb7SOZOnOlLF-vNpHy6Eqvr5Q45svfX1PY,6911
13
- test/integration/connectors/test_google_drive.py,sha256=0zJZ4UJOq4TkfU-bkc556_abV7q6zVS9ZgIvW9qcTU4,4204
13
+ test/integration/connectors/test_google_drive.py,sha256=ubjn3wvMhgpGHQs-wT_5icGgTIx2coS6hwNkAHOCEI8,10306
14
14
  test/integration/connectors/test_lancedb.py,sha256=8MBxK_CUtOt87-4B7svDDK82NFII5psceo5cNN8HJMs,9228
15
15
  test/integration/connectors/test_milvus.py,sha256=7mI6zznN0PTxDL9DLogH1k3dxx6R8DgGzlpyevsFu2w,7173
16
16
  test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8uNWTF2wyLGwlhco,12446
@@ -107,7 +107,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
107
107
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
108
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
109
109
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
110
- unstructured_ingest/__version__.py,sha256=LXdgOM6QWErpDu1oCqJrypfmAkBaXzRxVPcjHL8yPrI,42
110
+ unstructured_ingest/__version__.py,sha256=vh_Kp5pxLwbO2jsp-9Z1oAftmDaQZ9nkLMwneOowYbU,42
111
111
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
112
112
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
113
113
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -276,15 +276,15 @@ unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ
276
276
  unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
277
277
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
278
278
  unstructured_ingest/embed/azure_openai.py,sha256=u9reyZzY6BtsT5U_TdIfS6vH_42lvohVBwKMPQAqvkI,1528
279
- unstructured_ingest/embed/bedrock.py,sha256=50G8PBEdW3ILwyWXAWl4w-gUA9I0AR7LuFq6NLz-sWI,7284
280
- unstructured_ingest/embed/huggingface.py,sha256=Avcc16st9Cp2xGScG6TeNEEd3T8YjjnESNN4OdIlnh0,2119
281
- unstructured_ingest/embed/interfaces.py,sha256=7jsQ3rLOXy1hq__muf-EPcLnv17XzNQaD05AyGbZeNo,3739
282
- unstructured_ingest/embed/mixedbreadai.py,sha256=OhF5cMxWMq8-0mt8_-Xe3ZkjGjf2u6QYzfzgHnOEYtU,6838
283
- unstructured_ingest/embed/octoai.py,sha256=oLNlM02W1CNUYRG_j6qWyI7yE24vYGKYradNzeeP6mE,5062
284
- unstructured_ingest/embed/openai.py,sha256=H1sURGuRvXBUSXJcAVzrLObV5wSCVM29tkaXJ-9ZR30,4727
285
- unstructured_ingest/embed/togetherai.py,sha256=SUd16JEUPlR8aCrd4q_T3CHwMTRUi-1yenq_r1AWlak,4266
286
- unstructured_ingest/embed/vertexai.py,sha256=CPptS7U5W1CgvxIN8CgVz5J1Ia4FctV6BsmpN9c92A0,4890
287
- unstructured_ingest/embed/voyageai.py,sha256=lydMASUDcTuyfWBPS3uIqDJPQbjf95bEI5Kr4tytONs,5111
279
+ unstructured_ingest/embed/bedrock.py,sha256=LnlxU2cC7mrq5wLiZ6D_7lv0_z_O2YtmY_8oRMu1N8E,7548
280
+ unstructured_ingest/embed/huggingface.py,sha256=4ytvdGrXur-PllLaNdKGQ4BhxxOJlHNzj4NfBwTUEsk,2270
281
+ unstructured_ingest/embed/interfaces.py,sha256=-SLdQKX6-KIa2Jq_-rz14noBnH9VuV8flOUKr8WJKMM,3109
282
+ unstructured_ingest/embed/mixedbreadai.py,sha256=ALRedRFg9xzFkYuV26uSjLGU4_3kHS46P8uSWeNdfoY,7214
283
+ unstructured_ingest/embed/octoai.py,sha256=lvfgs5Bnpn6lb_q4LenhFZ6IXC_L8xYilGC03ecJIGc,5318
284
+ unstructured_ingest/embed/openai.py,sha256=3QWindgIsziI5ChQ6Zzqt4hQ9g-qKeTFDZvt55YsA1k,4983
285
+ unstructured_ingest/embed/togetherai.py,sha256=lhUgiC24xuXbzmVlqgjrqtU4cJs-sIP-myxkaRK_tnk,4642
286
+ unstructured_ingest/embed/vertexai.py,sha256=LSLR9iOBX07e7bsIcMyU5kgmHm0zgC4GZ_gO6WL4xYY,5146
287
+ unstructured_ingest/embed/voyageai.py,sha256=M9LZ-YP54FSXzLt7XyWP4UoLb0naP6acpUNdQS-MucQ,5487
288
288
  unstructured_ingest/enhanced_dataclass/__init__.py,sha256=gDZOUsv5eo-8jm4Yu7DdDwi101aGbfG7JctTdOYnTOM,151
289
289
  unstructured_ingest/enhanced_dataclass/core.py,sha256=d6aUkDynuKX87cHx9_N5UDUWrvISR4jYRFRTvd_avlI,3038
290
290
  unstructured_ingest/enhanced_dataclass/dataclasses.py,sha256=aZMsoCzAGRb8Rmh3BTSBFtNr6FmFTY93KYGLk3gYJKQ,1949
@@ -432,7 +432,7 @@ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVm
432
432
  unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=SotSXZQ85_6TO906YvFi3yTml8jE9A_zV6nBJ4oTx8A,7075
433
433
  unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
434
434
  unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5BsGXwSaL7RbEXcaagJQYsylNo,9984
435
- unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=tSbyibwm9RQyXD-HJGZa1Y9lBSCXaEFnvxpf6bHwBSE,13394
435
+ unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=QzcHNelUbnubsDtanFIgDCRzmYTuP-GjJ_g9y8fButE,19623
436
436
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOGQWxudzQEDopXM8XkfkQ2j6g,5004
437
437
  unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
438
438
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
@@ -558,7 +558,7 @@ unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=NSEZwJDHh_9kF
558
558
  unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=xbZ90rmehiCnBoqFXMz-3ZMXeYb0PzWB6iobCNSHTmQ,8955
559
559
  unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
560
560
  unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=OPBDQ2c_5KjWHEFfqXxf3pQ2tWC-N4MtslMulMgP1Wc,5503
561
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=QE-WBqrPVjCgcxR5EdVD9iTHBjgDSSSQgWYvq5N61qU,7746
561
+ unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=0hfiX_u7V38k_RfoeDmXJp8WIHZ19ilIHnrgZVSleKw,9270
562
562
  unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=F5PPUxt2W8JaAQGfz5Od0FvKqYa15RfwMIlnrdJu1nk,15317
563
563
  unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=PRjN_S7UQv0k4ZpSyclW1AJrsrugyxbR-GoOrHvBpks,5200
564
564
  unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=0rxrb1ByXIefB9umzMTEJbpvzdTttXHK5DjRY97-GG8,9618
@@ -567,9 +567,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
567
567
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
568
568
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
569
569
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
570
- unstructured_ingest-0.5.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
571
- unstructured_ingest-0.5.1.dist-info/METADATA,sha256=4fo4K5ac0RNRlWGGyNumZ5gXJf-0PwknZWjS6HvAD6w,8051
572
- unstructured_ingest-0.5.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
573
- unstructured_ingest-0.5.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
574
- unstructured_ingest-0.5.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
575
- unstructured_ingest-0.5.1.dist-info/RECORD,,
570
+ unstructured_ingest-0.5.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
571
+ unstructured_ingest-0.5.2.dist-info/METADATA,sha256=SiWzXim0-JmdQF6rlC6RbnmRBHcheGOk2VMlKN2A2ms,8051
572
+ unstructured_ingest-0.5.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
573
+ unstructured_ingest-0.5.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
574
+ unstructured_ingest-0.5.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
575
+ unstructured_ingest-0.5.2.dist-info/RECORD,,