unstructured-ingest 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/test_google_drive.py +141 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +13 -6
- unstructured_ingest/embed/huggingface.py +11 -4
- unstructured_ingest/embed/interfaces.py +2 -21
- unstructured_ingest/embed/mixedbreadai.py +13 -4
- unstructured_ingest/embed/octoai.py +13 -6
- unstructured_ingest/embed/openai.py +13 -6
- unstructured_ingest/embed/togetherai.py +13 -4
- unstructured_ingest/embed/vertexai.py +13 -6
- unstructured_ingest/embed/voyageai.py +13 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +132 -3
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +53 -3
- {unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/METADATA +21 -21
- {unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/RECORD +19 -19
- {unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_google_drive.py
@@ -1,6 +1,8 @@
 import os
+import uuid
 
 import pytest
+from googleapiclient.errors import HttpError
 
 from test.integration.connectors.utils.constants import (
     SOURCE_TAG,
@@ -13,6 +15,9 @@ from test.integration.connectors.utils.validation.source import (
     update_fixtures,
 )
 from test.integration.utils import requires_env
+from unstructured_ingest.error import (
+    SourceConnectionError,
+)
 from unstructured_ingest.v2.interfaces import Downloader, Indexer
 from unstructured_ingest.v2.processes.connectors.google_drive import (
     CONNECTOR_TYPE,
@@ -25,6 +30,49 @@ from unstructured_ingest.v2.processes.connectors.google_drive import (
 )
 
 
+@pytest.fixture
+def google_drive_connection_config():
+    """
+    Build a valid GoogleDriveConnectionConfig using the environment variables.
+    Expects:
+      - GOOGLE_DRIVE_ID
+      - GOOGLE_DRIVE_SERVICE_KEY
+    """
+    drive_id = os.getenv("GOOGLE_DRIVE_ID")
+    service_key = os.getenv("GOOGLE_DRIVE_SERVICE_KEY")
+    if not drive_id or not service_key:
+        pytest.skip("Google Drive credentials not provided in environment variables.")
+
+    access_config = GoogleDriveAccessConfig(service_account_key=service_key)
+    return GoogleDriveConnectionConfig(drive_id=drive_id, access_config=access_config)
+
+
+@pytest.fixture
+def google_drive_empty_folder(google_drive_connection_config):
+    """
+    Creates an empty folder on Google Drive for testing the "empty folder" case.
+    The folder is deleted after the test.
+    """
+    from google.oauth2 import service_account
+    from googleapiclient.discovery import build
+
+    access_config = google_drive_connection_config.access_config.get_secret_value()
+    creds = service_account.Credentials.from_service_account_info(access_config.service_account_key)
+    service = build("drive", "v3", credentials=creds)
+
+    # Create an empty folder.
+    file_metadata = {
+        "name": f"utic-empty-folder-{uuid.uuid4()}",
+        "mimeType": "application/vnd.google-apps.folder",
+    }
+    folder = service.files().create(body=file_metadata, fields="id, name").execute()
+    folder_id = folder.get("id")
+    try:
+        yield folder_id
+    finally:
+        service.files().delete(fileId=folder_id).execute()
+
+
 @requires_env("GOOGLE_DRIVE_SERVICE_KEY")
 @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE)
 def test_google_drive_source(temp_dir):
@@ -114,3 +162,96 @@ def source_connector_validation(
         save_downloads=configs.validate_downloaded_files,
         save_filedata=configs.validate_file_data,
     )
+
+
+# Precheck fails when the drive ID has an appended parameter (simulate copy-paste error)
+@pytest.mark.tags("google-drive", "precheck")
+@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+def test_google_drive_precheck_invalid_parameter(google_drive_connection_config):
+    # Append a query parameter as often happens when copying from a URL.
+    invalid_drive_id = google_drive_connection_config.drive_id + "?usp=sharing"
+    connection_config = GoogleDriveConnectionConfig(
+        drive_id=invalid_drive_id,
+        access_config=google_drive_connection_config.access_config,
+    )
+    index_config = GoogleDriveIndexerConfig(recursive=True)
+    indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
+    with pytest.raises(SourceConnectionError) as excinfo:
+        indexer.precheck()
+    assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()
+
+
+# Precheck fails due to lack of permission (simulate via monkeypatching).
+@pytest.mark.tags("google-drive", "precheck")
+@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+def test_google_drive_precheck_no_permission(google_drive_connection_config, monkeypatch):
+    index_config = GoogleDriveIndexerConfig(recursive=True)
+    indexer = GoogleDriveIndexer(
+        connection_config=google_drive_connection_config,
+        index_config=index_config,
+    )
+
+    # Monkeypatch get_root_info to always raise an HTTP 403 error.
+    def fake_get_root_info(files_client, object_id):
+        raise HttpError(
+            resp=type("Response", (), {"status": 403, "reason": "Forbidden"})(),
+            content=b"Forbidden",
+        )
+
+    monkeypatch.setattr(indexer, "get_root_info", fake_get_root_info)
+    with pytest.raises(SourceConnectionError) as excinfo:
+        indexer.precheck()
+    assert "forbidden" in str(excinfo.value).lower() or "permission" in str(excinfo.value).lower()
+
+
+# Precheck fails when the folder is empty.
+# @pytest.mark.tags("google-drive", "precheck")
+# @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+# def test_google_drive_precheck_empty_folder(
+#     google_drive_connection_config, google_drive_empty_folder
+# ):
+#     # Use the empty folder's ID as the target.
+#     connection_config = GoogleDriveConnectionConfig(
+#         drive_id=google_drive_empty_folder,
+#         access_config=google_drive_connection_config.access_config,
+#     )
+
+#     index_config = GoogleDriveIndexerConfig(recursive=True)
+#     indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
+#     with pytest.raises(SourceConnectionError) as excinfo:
+#         indexer.precheck()
+#     assert "empty folder" in str(excinfo.value).lower()
+
+
+@pytest.mark.tags("google-drive", "count", "integration")
+@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+def test_google_drive_count_files(google_drive_connection_config):
+    """
+    This test verifies that the count_files_recursively method returns the expected count of files.
+    According to the test credentials, there are 3 files in the root directory and 1 nested file,
+    so the total count should be 4.
+    """
+    # I assumed that we're applying the same extension filter as with other tests
+    # However there's 6 files in total in the test dir
+    extensions_filter = ["pdf", "docx"]
+    with google_drive_connection_config.get_client() as client:
+        count = GoogleDriveIndexer.count_files_recursively(
+            client, google_drive_connection_config.drive_id, extensions_filter
+        )
+    assert count == 4, f"Expected file count of 4, but got {count}"
+
+
+# Precheck fails with a completely invalid drive ID.
+@pytest.mark.tags("google-drive", "precheck")
+@requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
+def test_google_drive_precheck_invalid_drive_id(google_drive_connection_config):
+    invalid_drive_id = "invalid_drive_id"
+    connection_config = GoogleDriveConnectionConfig(
+        drive_id=invalid_drive_id,
+        access_config=google_drive_connection_config.access_config,
+    )
+    index_config = GoogleDriveIndexerConfig(recursive=True)
+    indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
+    with pytest.raises(SourceConnectionError) as excinfo:
+        indexer.precheck()
+    assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.1"  # pragma: no cover
+__version__ = "0.5.2"  # pragma: no cover

unstructured_ingest/embed/bedrock.py
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, AsyncIterable
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -145,9 +146,12 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
         return response_body.get("embedding")
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = [self.embed_query(query=e["text"]) for e in elements_with_text]
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
 
 @dataclass
@@ -186,8 +190,11 @@ class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
             raise ValueError(f"Error raised by inference endpoint: {e}")
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
         embeddings = await asyncio.gather(
-            *[self.embed_query(query=e.get("text", "")) for e in elements]
+            *[self.embed_query(query=e.get("text", "")) for e in elements_with_text]
         )
-
-
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements

unstructured_ingest/embed/huggingface.py
@@ -3,7 +3,11 @@ from typing import TYPE_CHECKING, Optional
 
 from pydantic import Field
 
-from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
+    BaseEmbeddingEncoder,
+    EmbeddingConfig,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
@@ -52,6 +56,9 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
         return embeddings.tolist()
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements

unstructured_ingest/embed/interfaces.py
@@ -6,6 +6,8 @@ from typing import Optional
 import numpy as np
 from pydantic import BaseModel, Field
 
+EMBEDDINGS_KEY = "embeddings"
+
 
 class EmbeddingConfig(BaseModel):
     batch_size: Optional[int] = Field(
@@ -26,27 +28,6 @@ class BaseEncoder(ABC):
         if possible"""
         return e
 
-    @staticmethod
-    def _add_embeddings_to_elements(
-        elements: list[dict], embeddings: list[list[float]]
-    ) -> list[dict]:
-        """
-        Add embeddings to elements.
-
-        Args:
-            elements (list[Element]): List of elements.
-            embeddings (list[list[float]]): List of embeddings.
-
-        Returns:
-            list[Element]: Elements with embeddings added.
-        """
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
-
 
 @dataclass
 class BaseEmbeddingEncoder(BaseEncoder, ABC):

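Net effect of the embed changes in this release: the shared `_add_embeddings_to_elements` helper is gone, every encoder imports the new `EMBEDDINGS_KEY` constant, and only elements with non-empty text get embedded, while the full element list is still returned. A minimal sketch of that shared pattern, with a hypothetical `embed_fn` standing in for a real encoder call:

from typing import Callable

EMBEDDINGS_KEY = "embeddings"  # as defined in unstructured_ingest/embed/interfaces.py

def embed_documents(elements: list[dict], embed_fn: Callable[[str], list[float]]) -> list[dict]:
    # Mirror of the 0.5.2 per-encoder pattern: only elements with non-empty
    # "text" are embedded; all elements are returned either way.
    elements = elements.copy()
    elements_with_text = [e for e in elements if e.get("text")]
    embeddings = [embed_fn(e["text"]) for e in elements_with_text]
    for element, embedding in zip(elements_with_text, embeddings):
        element[EMBEDDINGS_KEY] = embedding
    return elements

# A dummy embedder: the text-less element passes through untouched.
out = embed_documents(
    [{"text": "hello"}, {"text": ""}],
    embed_fn=lambda s: [float(len(s))],
)
assert out[0][EMBEDDINGS_KEY] == [5.0] and EMBEDDINGS_KEY not in out[1]
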
unstructured_ingest/embed/mixedbreadai.py
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -134,8 +135,12 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
         Returns:
             list[Element]: Elements with embeddings.
         """
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = self._embed([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     def embed_query(self, query: str) -> list[float]:
         """
@@ -209,8 +214,12 @@ class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         Returns:
             list[Element]: Elements with embeddings.
         """
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = await self._embed([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     async def embed_query(self, query: str) -> list[float]:
         """

unstructured_ingest/embed/octoai.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -89,7 +90,9 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return response.data[0].embedding
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        texts = [e["text"] for e in elements_with_text]
         embeddings = []
         client = self.config.get_client()
         try:
@@ -100,8 +103,9 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
             embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-
-
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
 
 @dataclass
@@ -122,7 +126,9 @@ class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         return response.data[0].embedding
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        texts = [e["text"] for e in elements_with_text]
         client = self.config.get_async_client()
         embeddings = []
         try:
@@ -133,5 +139,6 @@ class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
             embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-
-
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements

unstructured_ingest/embed/openai.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -82,7 +83,9 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         client = self.config.get_client()
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        texts = [e["text"] for e in elements_with_text]
         embeddings = []
         try:
             for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
@@ -92,8 +95,9 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
             embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-
-
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
 
 @dataclass
@@ -115,7 +119,9 @@ class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
         client = self.config.get_async_client()
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        texts = [e["text"] for e in elements_with_text]
         embeddings = []
         try:
             for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
@@ -125,5 +131,6 @@ class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
             embeddings.extend([data.embedding for data in response.data])
         except Exception as e:
             raise self.wrap_error(e=e)
-
-
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements

unstructured_ingest/embed/togetherai.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -67,8 +68,12 @@ class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return self._embed_documents(elements=[query])[0]
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_client()
@@ -98,8 +103,12 @@ class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         return embedding[0]
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client = self.config.get_async_client()

unstructured_ingest/embed/vertexai.py
@@ -9,6 +9,7 @@ from pydantic import Field, Secret, ValidationError
 from pydantic.functional_validators import BeforeValidator
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -75,9 +76,12 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return self._embed_documents(elements=[query])[0]
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     @requires_dependencies(
         ["vertexai"],
@@ -110,9 +114,12 @@ class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         return embedding[0]
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     @requires_dependencies(
         ["vertexai"],

unstructured_ingest/embed/voyageai.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Optional
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -107,8 +108,12 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return embeddings
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(elements=[query])[0]
@@ -135,8 +140,12 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         return embeddings
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     async def embed_query(self, query: str) -> list[float]:
         embedding = await self._embed_documents(elements=[query])

unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -132,12 +132,141 @@ class GoogleDriveIndexer(Indexer):
         ]
     )
 
+    @staticmethod
+    def verify_drive_api_enabled(client) -> None:
+        from googleapiclient.errors import HttpError
+
+        """
+        Makes a lightweight API call to verify that the Drive API is enabled.
+        If the API is not enabled, an HttpError should be raised.
+        """
+        try:
+            # A very minimal call: list 1 file from the drive.
+            client.list(spaces="drive", pageSize=1, fields="files(id)").execute()
+        except HttpError as e:
+            error_content = e.content.decode() if hasattr(e, "content") else ""
+            lower_error = error_content.lower()
+            if "drive api" in lower_error and (
+                "not enabled" in lower_error or "not been used" in lower_error
+            ):
+                raise SourceConnectionError(
+                    "Google Drive API is not enabled for your project. \
+                    Please enable it in the Google Cloud Console."
+                )
+            else:
+                raise SourceConnectionError("Google drive API unreachable for an unknown reason!")
+
+    @staticmethod
+    def count_files_recursively(files_client, folder_id: str, extensions: list[str] = None) -> int:
+        """
+        Count non-folder files recursively under the given folder.
+        If `extensions` is provided, only count files
+        whose `fileExtension` matches one of the values.
+        """
+        count = 0
+        stack = [folder_id]
+        while stack:
+            current_folder = stack.pop()
+            # Always list all items under the current folder.
+            query = f"'{current_folder}' in parents"
+            page_token = None
+            while True:
+                response = files_client.list(
+                    spaces="drive",
+                    q=query,
+                    fields="nextPageToken, files(id, mimeType, fileExtension)",
+                    pageToken=page_token,
+                    pageSize=1000,
+                ).execute()
+                for item in response.get("files", []):
+                    if item.get("mimeType") == "application/vnd.google-apps.folder":
+                        # Always traverse sub-folders regardless of extension filter.
+                        stack.append(item["id"])
+                    else:
+                        if extensions:
+                            # Use a case-insensitive comparison for the file extension.
+                            file_ext = (item.get("fileExtension") or "").lower()
+                            valid_exts = [e.lower() for e in extensions]
+                            if file_ext in valid_exts:
+                                count += 1
+                        else:
+                            count += 1
+                page_token = response.get("nextPageToken")
+                if not page_token:
+                    break
+        return count
+
     def precheck(self) -> None:
+        """
+        Enhanced precheck that verifies not only connectivity
+        but also that the provided drive_id is valid and accessible.
+        """
         try:
-            self.connection_config.get_client()
+            with self.connection_config.get_client() as client:
+                # First, verify that the Drive API is enabled.
+                self.verify_drive_api_enabled(client)
+
+                # Try to retrieve metadata for the drive id.
+                # This will catch errors such as an invalid drive id or insufficient permissions.
+                root_info = self.get_root_info(
+                    files_client=client, object_id=self.connection_config.drive_id
+                )
+                logger.info(
+                    f"Successfully retrieved drive root info: "
+                    f"{root_info.get('name', 'Unnamed')} (ID: {root_info.get('id')})"
+                )
+
+                # If the target is a folder, perform file count check.
+                if self.is_dir(root_info):
+                    if self.index_config.recursive:
+                        file_count = self.count_files_recursively(
+                            client,
+                            self.connection_config.drive_id,
+                            extensions=self.index_config.extensions,
+                        )
+                        if file_count == 0:
+                            logger.warning(
+                                "Empty folder: no files found recursively in the folder. \
+                                Please verify that the folder contains files and \
+                                that the service account has proper permissions."
+                            )
+                            # raise SourceConnectionError(
+                            #     "Empty folder: no files found recursively in the folder. "
+                            #     "Please verify that the folder contains files and \
+                            #     that the service account has proper permissions."
+                            # )
+                        else:
+                            logger.info(f"Found {file_count} files recursively in the folder.")
+                    else:
+                        # Non-recursive: check for at least one immediate non-folder child.
+                        response = client.list(
+                            spaces="drive",
+                            fields="files(id)",
+                            pageSize=1,
+                            q=f"'{self.connection_config.drive_id}' in parents",
+                        ).execute()
+                        if not response.get("files"):
+                            logger.warning(
+                                "Empty folder: no files found at the folder's root level. "
+                                "Please verify that the folder contains files and \
+                                that the service account has proper permissions."
+                            )
+                            # raise SourceConnectionError(
+                            #     "Empty folder: no files found at the folder's root level. "
+                            #     "Please verify that the folder contains files and \
+                            #     that the service account has proper permissions."
+                            # )
+                        else:
+                            logger.info("Found files at the folder's root level.")
+                else:
+                    # If the target is a file, precheck passes.
+                    logger.info("Drive ID corresponds to a file. Precheck passed.")
+
         except Exception as e:
-            logger.error(
-
+            logger.error(
+                "Failed to validate Google Drive connection during precheck", exc_info=True
+            )
+            raise SourceConnectionError(f"Precheck failed: {e}")
 
     @staticmethod
     def is_dir(record: dict) -> bool:

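The new `count_files_recursively` walks folders iteratively with an explicit stack and Drive's `nextPageToken` pagination, so deeply nested shares cannot overflow the call stack. A rough sketch of that traversal against a hypothetical in-memory stub of the `files()` resource (the `FakeFilesClient` and `FakeRequest` names are illustrative, not part of the connector):

from unstructured_ingest.v2.processes.connectors.google_drive import GoogleDriveIndexer

class FakeRequest:
    def __init__(self, payload):
        self._payload = payload

    def execute(self):
        return self._payload  # a single page, so no nextPageToken

class FakeFilesClient:
    """Hypothetical stand-in for the googleapiclient files() resource."""

    def __init__(self, tree):
        self._tree = tree  # folder_id -> list of child file dicts

    def list(self, q="", **kwargs):
        folder_id = q.split("'")[1]  # parse "'<id>' in parents"
        return FakeRequest({"files": self._tree.get(folder_id, [])})

tree = {
    "root": [
        {"id": "f1", "mimeType": "application/pdf", "fileExtension": "pdf"},
        {"id": "sub", "mimeType": "application/vnd.google-apps.folder"},
    ],
    "sub": [{"id": "f2", "mimeType": "image/png", "fileExtension": "png"}],
}
# Only the pdf matches the filter; the png inside "sub" is still visited but not counted.
assert GoogleDriveIndexer.count_files_recursively(FakeFilesClient(tree), "root", ["pdf"]) == 1
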
unstructured_ingest/v2/processes/connectors/sql/snowflake.py
@@ -1,6 +1,7 @@
+import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 import numpy as np
 import pandas as pd
@@ -15,6 +16,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    _DATE_COLUMNS,
     SQLAccessConfig,
     SqlBatchFileData,
     SQLConnectionConfig,
@@ -26,6 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
     SQLUploaderConfig,
     SQLUploadStager,
     SQLUploadStagerConfig,
+    parse_date_string,
 )
 
 if TYPE_CHECKING:
@@ -34,6 +37,17 @@ if TYPE_CHECKING:
 
 CONNECTOR_TYPE = "snowflake"
 
+_ARRAY_COLUMNS = (
+    "embeddings",
+    "languages",
+    "link_urls",
+    "link_texts",
+    "sent_from",
+    "sent_to",
+    "emphasized_text_contents",
+    "emphasized_text_tags",
+)
+
 
 class SnowflakeAccessConfig(SQLAccessConfig):
     password: Optional[str] = Field(default=None, description="DB password")
@@ -160,6 +174,42 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"
 
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        output = []
+        for row in data:
+            parsed = []
+            for column_name, value in zip(columns, row):
+                if column_name in _DATE_COLUMNS:
+                    if value is None or pd.isna(value):  # pandas is nan
+                        parsed.append(None)
+                    else:
+                        parsed.append(parse_date_string(value))
+                elif column_name in _ARRAY_COLUMNS:
+                    if not isinstance(value, list) and (
+                        value is None or pd.isna(value)
+                    ):  # pandas is nan
+                        parsed.append(None)
+                    else:
+                        parsed.append(json.dumps(value))
+                else:
+                    parsed.append(value)
+            output.append(tuple(parsed))
+        return output
+
+    def _parse_values(self, columns: list[str]) -> str:
+        return ",".join(
+            [
+                (
+                    f"PARSE_JSON({self.values_delimiter})"
+                    if col in _ARRAY_COLUMNS
+                    else self.values_delimiter
+                )
+                for col in columns
+            ]
+        )
+
     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
@@ -173,10 +223,10 @@ class SnowflakeUploader(SQLUploader):
         self._fit_to_schema(df=df)
 
         columns = list(df.columns)
-        stmt = "INSERT INTO {table_name} ({columns})
+        stmt = "INSERT INTO {table_name} ({columns}) SELECT {values}".format(
            table_name=self.upload_config.table_name,
             columns=",".join(columns),
-            values=
+            values=self._parse_values(columns),
         )
         logger.info(
             f"writing a total of {len(df)} elements via"

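With `prepare_data` and `_parse_values` in place, list-valued columns are JSON-serialized on the Python side and routed through Snowflake's PARSE_JSON on the SQL side, which is how embeddings survive the round trip into a VARIANT-style column. A small sketch of the statement this builds (the table name and column set here are illustrative):

_ARRAY_COLUMNS = ("embeddings", "languages")  # subset of the tuple above, for illustration
values_delimiter = "?"

def parse_values(columns: list[str]) -> str:
    # Same shape as SnowflakeUploader._parse_values: array columns get PARSE_JSON(?).
    return ",".join(
        f"PARSE_JSON({values_delimiter})" if col in _ARRAY_COLUMNS else values_delimiter
        for col in columns
    )

columns = ["id", "text", "embeddings"]
stmt = "INSERT INTO {table_name} ({columns}) SELECT {values}".format(
    table_name="elements",  # illustrative table name
    columns=",".join(columns),
    values=parse_values(columns),
)
assert stmt == "INSERT INTO elements (id,text,embeddings) SELECT ?,?,PARSE_JSON(?)"
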
{unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.5.1
+Version: 0.5.2
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,31 +22,31 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist:
-Requires-Dist: click
-Requires-Dist: opentelemetry-sdk
+Requires-Dist: pandas
 Requires-Dist: pydantic>=2.7
+Requires-Dist: dataclasses-json
 Requires-Dist: python-dateutil
-Requires-Dist:
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: click
 Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
 Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
@@ -92,12 +92,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -122,20 +122,20 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: neo4j
-Requires-Dist: neo4j; extra == "neo4j"
-Requires-Dist: cymple; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
+Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -169,13 +169,13 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -184,19 +184,19 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: snowflake-connector-python; extra == "snowflake"
 Requires-Dist: psycopg2-binary; extra == "snowflake"
+Requires-Dist: snowflake-connector-python; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vastdb
-Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
+Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
 Provides-Extra: vectara
-Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
 Provides-Extra: weaviate
 Requires-Dist: weaviate-client; extra == "weaviate"

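The METADATA changes are a version bump plus a reordering of otherwise unchanged `Requires-Dist` entries, so installs are unaffected; the `Provides-Extra` names map directly onto pip extras. For example, pulling this release with the two connectors touched in it would look like:

pip install "unstructured-ingest[snowflake,google-drive]==0.5.2"
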
{unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/RECORD
@@ -10,7 +10,7 @@ test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNp
 test/integration/connectors/test_chroma.py,sha256=NuQv0PWPM0_LQfdPeUd6IYKqaKKXWmVaHGWjq5aBfOY,3721
 test/integration/connectors/test_confluence.py,sha256=Ju0gRQbD2g9l9iRf2HDZKi7RyPnBGtFRWcGpsqhO3F8,3588
 test/integration/connectors/test_delta_table.py,sha256=4qm2Arfc9Eb7SOZOnOlLF-vNpHy6Eqvr5Q45svfX1PY,6911
-test/integration/connectors/test_google_drive.py,sha256=
+test/integration/connectors/test_google_drive.py,sha256=ubjn3wvMhgpGHQs-wT_5icGgTIx2coS6hwNkAHOCEI8,10306
 test/integration/connectors/test_lancedb.py,sha256=8MBxK_CUtOt87-4B7svDDK82NFII5psceo5cNN8HJMs,9228
 test/integration/connectors/test_milvus.py,sha256=7mI6zznN0PTxDL9DLogH1k3dxx6R8DgGzlpyevsFu2w,7173
 test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8uNWTF2wyLGwlhco,12446
@@ -107,7 +107,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=vh_Kp5pxLwbO2jsp-9Z1oAftmDaQZ9nkLMwneOowYbU,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -276,15 +276,15 @@ unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ
 unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
 unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/embed/azure_openai.py,sha256=u9reyZzY6BtsT5U_TdIfS6vH_42lvohVBwKMPQAqvkI,1528
-unstructured_ingest/embed/bedrock.py,sha256=
-unstructured_ingest/embed/huggingface.py,sha256=
-unstructured_ingest/embed/interfaces.py,sha256=
-unstructured_ingest/embed/mixedbreadai.py,sha256=
-unstructured_ingest/embed/octoai.py,sha256=
-unstructured_ingest/embed/openai.py,sha256=
-unstructured_ingest/embed/togetherai.py,sha256=
-unstructured_ingest/embed/vertexai.py,sha256=
-unstructured_ingest/embed/voyageai.py,sha256=
+unstructured_ingest/embed/bedrock.py,sha256=LnlxU2cC7mrq5wLiZ6D_7lv0_z_O2YtmY_8oRMu1N8E,7548
+unstructured_ingest/embed/huggingface.py,sha256=4ytvdGrXur-PllLaNdKGQ4BhxxOJlHNzj4NfBwTUEsk,2270
+unstructured_ingest/embed/interfaces.py,sha256=-SLdQKX6-KIa2Jq_-rz14noBnH9VuV8flOUKr8WJKMM,3109
+unstructured_ingest/embed/mixedbreadai.py,sha256=ALRedRFg9xzFkYuV26uSjLGU4_3kHS46P8uSWeNdfoY,7214
+unstructured_ingest/embed/octoai.py,sha256=lvfgs5Bnpn6lb_q4LenhFZ6IXC_L8xYilGC03ecJIGc,5318
+unstructured_ingest/embed/openai.py,sha256=3QWindgIsziI5ChQ6Zzqt4hQ9g-qKeTFDZvt55YsA1k,4983
+unstructured_ingest/embed/togetherai.py,sha256=lhUgiC24xuXbzmVlqgjrqtU4cJs-sIP-myxkaRK_tnk,4642
+unstructured_ingest/embed/vertexai.py,sha256=LSLR9iOBX07e7bsIcMyU5kgmHm0zgC4GZ_gO6WL4xYY,5146
+unstructured_ingest/embed/voyageai.py,sha256=M9LZ-YP54FSXzLt7XyWP4UoLb0naP6acpUNdQS-MucQ,5487
 unstructured_ingest/enhanced_dataclass/__init__.py,sha256=gDZOUsv5eo-8jm4Yu7DdDwi101aGbfG7JctTdOYnTOM,151
 unstructured_ingest/enhanced_dataclass/core.py,sha256=d6aUkDynuKX87cHx9_N5UDUWrvISR4jYRFRTvd_avlI,3038
 unstructured_ingest/enhanced_dataclass/dataclasses.py,sha256=aZMsoCzAGRb8Rmh3BTSBFtNr6FmFTY93KYGLk3gYJKQ,1949
@@ -432,7 +432,7 @@ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVm
 unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=SotSXZQ85_6TO906YvFi3yTml8jE9A_zV6nBJ4oTx8A,7075
 unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
 unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5BsGXwSaL7RbEXcaagJQYsylNo,9984
-unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=
+unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=QzcHNelUbnubsDtanFIgDCRzmYTuP-GjJ_g9y8fButE,19623
 unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOGQWxudzQEDopXM8XkfkQ2j6g,5004
 unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
@@ -558,7 +558,7 @@ unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=NSEZwJDHh_9kF
 unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=xbZ90rmehiCnBoqFXMz-3ZMXeYb0PzWB6iobCNSHTmQ,8955
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=OPBDQ2c_5KjWHEFfqXxf3pQ2tWC-N4MtslMulMgP1Wc,5503
-unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=0hfiX_u7V38k_RfoeDmXJp8WIHZ19ilIHnrgZVSleKw,9270
 unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=F5PPUxt2W8JaAQGfz5Od0FvKqYa15RfwMIlnrdJu1nk,15317
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=PRjN_S7UQv0k4ZpSyclW1AJrsrugyxbR-GoOrHvBpks,5200
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=0rxrb1ByXIefB9umzMTEJbpvzdTttXHK5DjRY97-GG8,9618
@@ -567,9 +567,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
 unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
-unstructured_ingest-0.5.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.5.1.dist-info/METADATA,sha256=
-unstructured_ingest-0.5.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.5.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.5.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.5.1.dist-info/RECORD,,
+unstructured_ingest-0.5.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.2.dist-info/METADATA,sha256=SiWzXim0-JmdQF6rlC6RbnmRBHcheGOk2VMlKN2A2ms,8051
+unstructured_ingest-0.5.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.5.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.2.dist-info/RECORD,,

{unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/LICENSE.md
RENAMED, file without changes

{unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/WHEEL
RENAMED, file without changes

{unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/entry_points.txt
RENAMED, file without changes

{unstructured_ingest-0.5.1.dist-info → unstructured_ingest-0.5.2.dist-info}/top_level.txt
RENAMED, file without changes