unstructured-ingest 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as potentially problematic.
- test/integration/connectors/sql/test_databricks_delta_tables.py +10 -10
- test/integration/connectors/weaviate/test_local.py +27 -6
- test/integration/embedders/test_azure_openai.py +1 -3
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -3
- test/integration/embedders/test_mixedbread.py +2 -2
- test/integration/embedders/test_octoai.py +2 -4
- test/integration/embedders/test_openai.py +2 -4
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +2 -4
- test/integration/embedders/test_voyageai.py +2 -4
- test/integration/embedders/utils.py +12 -14
- test/unit/embed/test_openai.py +12 -4
- test/unit/test_html.py +112 -0
- test/unit/v2/connectors/databricks/__init__.py +0 -0
- test/unit/v2/connectors/databricks/test_volumes_table.py +44 -0
- test/unit/v2/embedders/test_voyageai.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/huggingface.py +6 -1
- unstructured_ingest/embed/interfaces.py +9 -6
- unstructured_ingest/embed/mixedbreadai.py +3 -10
- unstructured_ingest/embed/octoai.py +14 -7
- unstructured_ingest/embed/openai.py +18 -5
- unstructured_ingest/embed/togetherai.py +19 -8
- unstructured_ingest/embed/vertexai.py +13 -6
- unstructured_ingest/embed/voyageai.py +19 -6
- unstructured_ingest/utils/data_prep.py +1 -1
- unstructured_ingest/utils/html.py +143 -93
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/interfaces/uploader.py +14 -1
- unstructured_ingest/v2/pipeline/pipeline.py +20 -6
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +15 -22
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +14 -11
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +15 -15
- unstructured_ingest/v2/processes/connectors/sql/sql.py +4 -1
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +41 -3
- unstructured_ingest/v2/processes/embedder.py +3 -0
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/METADATA +22 -22
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/RECORD +45 -41
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/confluence.py

@@ -7,6 +7,7 @@ from pydantic import Field, Secret
 
 from unstructured_ingest.error import SourceConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.html import HtmlMixin
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -180,19 +181,8 @@ class ConfluenceIndexer(Indexer):
             yield file_data
 
 
-class ConfluenceDownloaderConfig(DownloaderConfig):
-
-        default=False,
-        description="if true, will download images and replace "
-        "the html content with base64 encoded images",
-    )
-    extract_files: bool = Field(
-        default=False, description="if true, will download any embedded files"
-    )
-    force_download: bool = Field(
-        default=False,
-        description="if true, will redownload extracted files even if they already exist locally",
-    )
+class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
+    pass
 
 
 @dataclass
@@ -206,24 +196,27 @@ class ConfluenceDownloader(Downloader):
     ) -> list[DownloadResponse]:
         if not self.download_config.extract_files:
             return []
-
-
+        url = current_file_data.metadata.url
+        if url is None:
+            logger.warning(
+                f"""Missing URL for file: {current_file_data.source_identifiers.filename}.
+                Skipping file extraction."""
+            )
+            return []
         filepath = current_file_data.source_identifiers.relative_path
         download_path = Path(self.download_dir) / filepath
         download_dir = download_path.with_suffix("")
-        return
+        return self.download_config.extract_embedded_files(
+            url=url,
             download_dir=download_dir,
             original_filedata=current_file_data,
-
+            html=html,
             session=session,
-            force_download=self.download_config.force_download,
         )
 
     def run(self, file_data: FileData, **kwargs) -> download_responses:
         from bs4 import BeautifulSoup
 
-        from unstructured_ingest.utils.html import convert_image_tags
-
         doc_id = file_data.identifier
         try:
             with self.connection_config.get_client() as client:
@@ -246,8 +239,8 @@ class ConfluenceDownloader(Downloader):
             content = f"<body class='Document' >{title_html}{content}</body>"
             if self.download_config.extract_images:
                 with self.connection_config.get_client() as client:
-                    content =
-                        url=file_data.metadata.url,
+                    content = self.download_config.extract_html_images(
+                        url=file_data.metadata.url, html=content, session=client._session
                     )
 
             filepath = file_data.source_identifiers.relative_path
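The net effect of the Confluence hunks above: the connector-specific extract_images / extract_files / force_download fields are gone, and ConfluenceDownloaderConfig now gets its flags and extraction helpers from HtmlMixin. A self-contained sketch of that mixin pattern; the classes below are simplified stand-ins with hypothetical defaults, not the library's HtmlMixin or DownloaderConfig:

```python
# Simplified stand-ins illustrating the refactor: per-connector HTML flags move
# into a shared mixin that also carries the extraction helpers.
from pydantic import BaseModel, Field


class HtmlMixinSketch(BaseModel):
    extract_images: bool = Field(
        default=False, description="replace <img> tags with base64-encoded content"
    )
    extract_files: bool = Field(
        default=False, description="download files referenced in the HTML"
    )

    def extract_html_images(self, url: str, html: str) -> str:
        # The real mixin fetches each image over the HTTP session and inlines it;
        # here we simply return the HTML unchanged.
        return html


class DownloaderConfigSketch(BaseModel):
    download_dir: str = "downloads"


class ConfluenceDownloaderConfigSketch(DownloaderConfigSketch, HtmlMixinSketch):
    # Mirrors `class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin): pass`
    pass


config = ConfluenceDownloaderConfigSketch(extract_images=True, extract_files=True)
print(config.extract_images, config.extract_files)
```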

unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py

@@ -3,10 +3,11 @@ import os
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator
+from typing import Any, Generator
 
 from pydantic import Field
 
+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -14,9 +15,9 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
 from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
-
-
-
+    DatabricksDeltaTablesConnectionConfig,
+    DatabricksDeltaTablesUploadStager,
+    DatabricksDeltaTablesUploadStagerConfig,
 )
 
 CONNECTOR_TYPE = "databricks_volume_delta_tables"
@@ -28,17 +29,16 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
 
 
 @dataclass
-class DatabricksVolumeDeltaTableStager(
-    def write_output(self, output_path: Path, data: list[dict]
+class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-
-            json.dump(data, f)
+        write_data(path=output_path.with_suffix(".json"), data=data, indent=None)
 
 
 @dataclass
 class DatabricksVolumeDeltaTableUploader(Uploader):
-    connection_config:
+    connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
@@ -78,7 +78,10 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     @contextmanager
     def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor(**connect_kwargs) as cursor:
+            logger.debug(f"executing: USE CATALOG: '{self.upload_config.catalog}'")
             cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            logger.debug(f"executing: USE DATABASE: {self.upload_config.database}")
+            cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
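The two statements added to get_cursor() pin the Databricks SQL session to the configured catalog and database before any writes happen. A minimal stand-alone sketch of the same pattern; DummyCursor is a placeholder for the cursor that connection_config.get_cursor() actually yields:

```python
# Stand-alone sketch of the cursor setup pattern above; DummyCursor just echoes
# the statements instead of talking to databricks-sql-connector.
from contextlib import contextmanager
from typing import Any, Generator


class DummyCursor:
    def execute(self, statement: str) -> None:
        print(f"executing: {statement}")


@contextmanager
def get_cursor(catalog: str, database: str) -> Generator[Any, None, None]:
    cursor = DummyCursor()
    # Same statements the diff adds, so later writes hit the right schema.
    cursor.execute(f"USE CATALOG '{catalog}'")
    cursor.execute(f"USE DATABASE {database}")
    yield cursor


with get_cursor(catalog="main", database="default") as cursor:
    cursor.execute("SHOW TABLES")
```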
@@ -98,9 +101,9 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
 
 
 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
-    connection_config=
+    connection_config=DatabricksDeltaTablesConnectionConfig,
     uploader=DatabricksVolumeDeltaTableUploader,
     uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
     upload_stager=DatabricksVolumeDeltaTableStager,
-    upload_stager_config=
+    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
 )

unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py

@@ -31,7 +31,7 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "databricks_delta_tables"
 
 
-class
+class DatabricksDeltaTablesAccessConfig(SQLAccessConfig):
     token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
     client_secret: Optional[str] = Field(
@@ -39,8 +39,8 @@ class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
     )
 
 
-class
-    access_config: Secret[
+class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[DatabricksDeltaTablesAccessConfig]
     server_hostname: str = Field(description="server hostname connection config value")
     http_path: str = Field(description="http path connection config value")
     user_agent: str = "unstructuredio_oss"
@@ -102,24 +102,24 @@ class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
             yield cursor
 
 
-class
+class DatabricksDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
     pass
 
 
-class
-    upload_stager_config:
+class DatabricksDeltaTablesUploadStager(SQLUploadStager):
+    upload_stager_config: DatabricksDeltaTablesUploadStagerConfig
 
 
-class
+class DatabricksDeltaTablesUploaderConfig(SQLUploaderConfig):
     catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
     database: str = Field(description="Database name", default="default")
     table_name: str = Field(description="Table name")
 
 
 @dataclass
-class
-    upload_config:
-    connection_config:
+class DatabricksDeltaTablesUploader(SQLUploader):
+    upload_config: DatabricksDeltaTablesUploaderConfig
+    connection_config: DatabricksDeltaTablesConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
     @contextmanager
@@ -205,9 +205,9 @@ class DatabrickDeltaTablesUploader(SQLUploader):
 
 
 databricks_delta_tables_destination_entry = DestinationRegistryEntry(
-    connection_config=
-    uploader=
-    uploader_config=
-    upload_stager=
-    upload_stager_config=
+    connection_config=DatabricksDeltaTablesConnectionConfig,
+    uploader=DatabricksDeltaTablesUploader,
+    uploader_config=DatabricksDeltaTablesUploaderConfig,
+    upload_stager=DatabricksDeltaTablesUploadStager,
+    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
 )

unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -292,6 +292,9 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(str)
         return df
 
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
+        write_data(path=output_path, data=data)
+
     def run(
         self,
         elements_filepath: Path,
@@ -314,7 +317,7 @@ class SQLUploadStager(UploadStager):
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
-
+        self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
         return output_path
 
 
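Together with the volumes_table.py change above, this introduces a write_output() hook: SQLUploadStager.run() now ends by calling write_output(), so subclasses such as DatabricksVolumeDeltaTableStager only swap the serialization (single-line JSON via write_data(..., indent=None)). A simplified, self-contained sketch of that template-method pattern, not the library classes themselves:

```python
# Simplified stand-ins showing the new hook: run() always delegates the final
# write to write_output(), and subclasses only change the output format.
import json
from pathlib import Path


class BaseStagerSketch:
    def write_output(self, output_path: Path, data: list[dict]) -> None:
        # Default behaviour: pretty-printed JSON.
        output_path.write_text(json.dumps(data, indent=2))

    def run(self, data: list[dict], output_path: Path) -> Path:
        self.write_output(output_path=output_path, data=data)
        return output_path


class DeltaTableStagerSketch(BaseStagerSketch):
    def write_output(self, output_path: Path, data: list[dict]) -> None:
        # Mirrors DatabricksVolumeDeltaTableStager above: force a .json suffix and
        # omit indentation so embedded newlines don't break the delta-table load.
        output_path.with_suffix(".json").write_text(json.dumps(data, indent=None))


records = [{"element_id": "abc", "text": "first line\nsecond line"}]
print(DeltaTableStagerSketch().run(records, Path("staged.json")))
```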

unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py

@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from datetime import date, datetime
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
@@ -15,10 +16,10 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    VectorDBUploader,
 )
 from unstructured_ingest.v2.logger import logger
 
@@ -160,7 +161,9 @@ class WeaviateUploadStager(UploadStager):
 
 
 class WeaviateUploaderConfig(UploaderConfig):
-    collection: str = Field(
+    collection: Optional[str] = Field(
+        description="The name of the collection this object belongs to", default=None
+    )
     batch_size: Optional[int] = Field(default=None, description="Number of records per batch")
     requests_per_minute: Optional[int] = Field(default=None, description="Rate limit for upload")
     dynamic_batch: bool = Field(default=True, description="Whether to use dynamic batch")
@@ -205,17 +208,50 @@ class WeaviateUploaderConfig(UploaderConfig):
 
 
 @dataclass
-class WeaviateUploader(
+class WeaviateUploader(VectorDBUploader, ABC):
     upload_config: WeaviateUploaderConfig
     connection_config: WeaviateConnectionConfig
 
+    def _collection_exists(self, collection_name: Optional[str] = None):
+        collection_name = collection_name or self.upload_config.collection
+        with self.connection_config.get_client() as weaviate_client:
+            return weaviate_client.collections.exists(name=collection_name)
+
     def precheck(self) -> None:
         try:
             self.connection_config.get_client()
+            # only if collection name populated should we check that it exists
+            if self.upload_config.collection and not self._collection_exists():
+                raise DestinationConnectionError(
+                    f"collection '{self.upload_config.collection}' does not exist"
+                )
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
+    def init(self, *kwargs: Any) -> None:
+        self.create_destination()
+
+    def create_destination(
+        self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
+    ) -> bool:
+        collection_name = self.upload_config.collection or destination_name
+        self.upload_config.collection = collection_name
+        connectors_dir = Path(__file__).parents[1]
+        collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
+        with collection_config_file.open() as f:
+            collection_config = json.load(f)
+        collection_config["class"] = collection_name
+        if not self._collection_exists():
+            logger.info(
+                f"creating default weaviate collection '{collection_name}' with default configs"
+            )
+            with self.connection_config.get_client() as weaviate_client:
+                weaviate_client.collections.create_from_dict(config=collection_config)
+            return True
+        logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
+        return False
+
     def check_for_errors(self, client: "WeaviateClient") -> None:
         failed_uploads = client.batch.failed_objects
         if failed_uploads:
@@ -253,6 +289,8 @@ class WeaviateUploader(Uploader, ABC):
             f"writing {len(data)} objects to destination "
             f"class {self.connection_config.access_config} "
         )
+        if not self.upload_config.collection:
+            raise ValueError("No collection specified")
 
         with self.connection_config.get_client() as weaviate_client:
             self.delete_by_record_id(client=weaviate_client, file_data=file_data)
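The bulk of the Weaviate change is the create-if-missing flow: init() calls create_destination(), which falls back to the default collection name "elements", loads the bundled assets/weaviate_collection_config.json, rewrites its "class" field, and creates the collection only when it does not already exist. A self-contained sketch of that logic; FakeCollections stands in for weaviate_client.collections, and the placeholder config written here is an assumption, since the real asset file's contents are not shown in this diff:

```python
# Self-contained sketch of the create-if-missing flow added to WeaviateUploader.
import json
from pathlib import Path
from typing import Optional


class FakeCollections:
    def __init__(self) -> None:
        self._existing = set()

    def exists(self, name: str) -> bool:
        return name in self._existing

    def create_from_dict(self, config: dict) -> None:
        self._existing.add(config["class"])


def create_destination(
    collections: FakeCollections,
    config_file: Path,
    collection: Optional[str] = None,
    destination_name: str = "elements",
) -> bool:
    # Fall back to a default collection name, then create it only when missing.
    collection_name = collection or destination_name
    collection_config = json.loads(config_file.read_text())
    collection_config["class"] = collection_name
    if not collections.exists(name=collection_name):
        collections.create_from_dict(config=collection_config)
        return True
    return False


# Placeholder stand-in for assets/weaviate_collection_config.json.
config_path = Path("weaviate_collection_config.json")
config_path.write_text(json.dumps({"class": "placeholder"}))
collections = FakeCollections()
print(create_destination(collections, config_path))                         # True: created "elements"
print(create_destination(collections, config_path, collection="elements"))  # False: already there
```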

unstructured_ingest/v2/processes/embedder.py

@@ -184,6 +184,9 @@ class EmbedderConfig(BaseModel):
 class Embedder(BaseProcess, ABC):
     config: EmbedderConfig
 
+    def init(self, *kwargs: Any) -> None:
+        self.config.get_embedder().initialize()
+
     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         # TODO update base embedder classes to support async
         embedder = self.config.get_embedder()
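The embedder gains an init() hook so the pipeline can warm up the configured embedder before any documents flow through it. A simplified sketch of the pattern with stand-in classes (the real EmbedderConfig.get_embedder() returns the provider-specific embedder):

```python
# Stand-in classes showing the init()-before-run() pattern from the diff above.
from dataclasses import dataclass


class FakeEmbedder:
    def initialize(self) -> None:
        # The real embedders would build their API client here.
        print("embedder client initialized")

    def embed(self, texts: list) -> list:
        return [[0.0, 0.0, 0.0] for _ in texts]


@dataclass
class EmbedderStepSketch:
    embedder: FakeEmbedder

    def init(self, **kwargs) -> None:
        # Mirrors Embedder.init(): initialize once, up front.
        self.embedder.initialize()

    def run(self, texts: list) -> list:
        return self.embedder.embed(texts)


step = EmbedderStepSketch(embedder=FakeEmbedder())
step.init()
print(step.run(["hello world"]))
```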

{unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.4.1
+Version: 0.4.3
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,38 +22,38 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: tqdm
-Requires-Dist: pydantic>=2.7
-Requires-Dist: python-dateutil
-Requires-Dist: pandas
 Requires-Dist: dataclasses-json
+Requires-Dist: pandas
+Requires-Dist: python-dateutil
 Requires-Dist: opentelemetry-sdk
 Requires-Dist: click
+Requires-Dist: pydantic>=2.7
+Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
 Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: atlassian-python-api; extra == "confluence"
 Requires-Dist: requests; extra == "confluence"
+Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: doc
@@ -72,8 +72,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
@@ -93,11 +93,11 @@ Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
 Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -126,16 +126,16 @@ Requires-Dist: neo4j; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -144,8 +144,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: msal; extra == "outlook"
+Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -174,26 +174,26 @@ Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
+Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vastdb
-Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
+Requires-Dist: pyarrow; extra == "vastdb"
 Provides-Extra: vectara
 Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"