unstructured-ingest 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (45)
  1. test/integration/connectors/sql/test_databricks_delta_tables.py +10 -10
  2. test/integration/connectors/weaviate/test_local.py +27 -6
  3. test/integration/embedders/test_azure_openai.py +1 -3
  4. test/integration/embedders/test_bedrock.py +2 -2
  5. test/integration/embedders/test_huggingface.py +1 -3
  6. test/integration/embedders/test_mixedbread.py +2 -2
  7. test/integration/embedders/test_octoai.py +2 -4
  8. test/integration/embedders/test_openai.py +2 -4
  9. test/integration/embedders/test_togetherai.py +2 -2
  10. test/integration/embedders/test_vertexai.py +2 -4
  11. test/integration/embedders/test_voyageai.py +2 -4
  12. test/integration/embedders/utils.py +12 -14
  13. test/unit/embed/test_openai.py +12 -4
  14. test/unit/test_html.py +112 -0
  15. test/unit/v2/connectors/databricks/__init__.py +0 -0
  16. test/unit/v2/connectors/databricks/test_volumes_table.py +44 -0
  17. test/unit/v2/embedders/test_voyageai.py +1 -1
  18. unstructured_ingest/__version__.py +1 -1
  19. unstructured_ingest/embed/huggingface.py +6 -1
  20. unstructured_ingest/embed/interfaces.py +9 -6
  21. unstructured_ingest/embed/mixedbreadai.py +3 -10
  22. unstructured_ingest/embed/octoai.py +14 -7
  23. unstructured_ingest/embed/openai.py +18 -5
  24. unstructured_ingest/embed/togetherai.py +19 -8
  25. unstructured_ingest/embed/vertexai.py +13 -6
  26. unstructured_ingest/embed/voyageai.py +19 -6
  27. unstructured_ingest/utils/data_prep.py +1 -1
  28. unstructured_ingest/utils/html.py +143 -93
  29. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  30. unstructured_ingest/v2/interfaces/process.py +3 -0
  31. unstructured_ingest/v2/interfaces/uploader.py +14 -1
  32. unstructured_ingest/v2/pipeline/pipeline.py +20 -6
  33. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  34. unstructured_ingest/v2/processes/connectors/confluence.py +15 -22
  35. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +14 -11
  36. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +15 -15
  37. unstructured_ingest/v2/processes/connectors/sql/sql.py +4 -1
  38. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +41 -3
  39. unstructured_ingest/v2/processes/embedder.py +3 -0
  40. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/METADATA +22 -22
  41. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/RECORD +45 -41
  42. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/LICENSE.md +0 -0
  43. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/WHEEL +0 -0
  44. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/entry_points.txt +0 -0
  45. {unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/confluence.py

@@ -7,6 +7,7 @@ from pydantic import Field, Secret
 
 from unstructured_ingest.error import SourceConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.html import HtmlMixin
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -180,19 +181,8 @@ class ConfluenceIndexer(Indexer):
             yield file_data
 
 
-class ConfluenceDownloaderConfig(DownloaderConfig):
-    extract_images: bool = Field(
-        default=False,
-        description="if true, will download images and replace "
-        "the html content with base64 encoded images",
-    )
-    extract_files: bool = Field(
-        default=False, description="if true, will download any embedded files"
-    )
-    force_download: bool = Field(
-        default=False,
-        description="if true, will redownload extracted files even if they already exist locally",
-    )
+class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
+    pass
 
 
 @dataclass
@@ -206,24 +196,27 @@ class ConfluenceDownloader(Downloader):
     ) -> list[DownloadResponse]:
         if not self.download_config.extract_files:
             return []
-        from unstructured_ingest.utils.html import download_embedded_files
-
+        url = current_file_data.metadata.url
+        if url is None:
+            logger.warning(
+                f"""Missing URL for file: {current_file_data.source_identifiers.filename}.
+                Skipping file extraction."""
+            )
+            return []
         filepath = current_file_data.source_identifiers.relative_path
         download_path = Path(self.download_dir) / filepath
         download_dir = download_path.with_suffix("")
-        return download_embedded_files(
+        return self.download_config.extract_embedded_files(
+            url=url,
             download_dir=download_dir,
             original_filedata=current_file_data,
-            original_html=html,
+            html=html,
             session=session,
-            force_download=self.download_config.force_download,
         )
 
     def run(self, file_data: FileData, **kwargs) -> download_responses:
         from bs4 import BeautifulSoup
 
-        from unstructured_ingest.utils.html import convert_image_tags
-
         doc_id = file_data.identifier
         try:
             with self.connection_config.get_client() as client:
@@ -246,8 +239,8 @@ class ConfluenceDownloader(Downloader):
         content = f"<body class='Document' >{title_html}{content}</body>"
         if self.download_config.extract_images:
             with self.connection_config.get_client() as client:
-                content = convert_image_tags(
-                    url=file_data.metadata.url, original_html=content, session=client._session
+                content = self.download_config.extract_html_images(
+                    url=file_data.metadata.url, html=content, session=client._session
                 )
 
         filepath = file_data.source_identifiers.relative_path
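The options that used to be declared directly on ConfluenceDownloaderConfig now come from HtmlMixin, and the downloader delegates to the mixin's extract_embedded_files/extract_html_images helpers instead of the module-level functions. A minimal usage sketch, assuming HtmlMixin carries over the extract_images and extract_files fields unchanged (the downloader above still reads both):

```python
# Hedged sketch: configuring the refactored downloader config. Field names are
# assumed to be provided by HtmlMixin; they are still read as
# self.download_config.extract_images / .extract_files in the hunks above.
from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceDownloaderConfig

download_config = ConfluenceDownloaderConfig(
    extract_images=True,  # inline <img> tags are re-fetched and embedded as base64
    extract_files=True,   # embedded attachments are downloaded next to the page HTML
)
```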
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py

@@ -3,10 +3,11 @@ import os
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator, Optional
+from typing import Any, Generator
 
 from pydantic import Field
 
+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -14,9 +15,9 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
 from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
-    DatabrickDeltaTablesConnectionConfig,
-    DatabrickDeltaTablesUploadStager,
-    DatabrickDeltaTablesUploadStagerConfig,
+    DatabricksDeltaTablesConnectionConfig,
+    DatabricksDeltaTablesUploadStager,
+    DatabricksDeltaTablesUploadStagerConfig,
 )
 
 CONNECTOR_TYPE = "databricks_volume_delta_tables"
@@ -28,17 +29,16 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
 
 
 @dataclass
-class DatabricksVolumeDeltaTableStager(DatabrickDeltaTablesUploadStager):
-    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
+class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-        with output_path.with_suffix(".json").open("w") as f:
-            json.dump(data, f)
+        write_data(path=output_path.with_suffix(".json"), data=data, indent=None)
 
 
 @dataclass
 class DatabricksVolumeDeltaTableUploader(Uploader):
-    connection_config: DatabrickDeltaTablesConnectionConfig
+    connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
@@ -78,7 +78,10 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     @contextmanager
     def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor(**connect_kwargs) as cursor:
+            logger.debug(f"executing: USE CATALOG: '{self.upload_config.catalog}'")
             cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            logger.debug(f"executing: USE DATABASE: {self.upload_config.database}")
+            cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -98,9 +101,9 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
 
 
 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
-    connection_config=DatabrickDeltaTablesConnectionConfig,
+    connection_config=DatabricksDeltaTablesConnectionConfig,
     uploader=DatabricksVolumeDeltaTableUploader,
     uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
     upload_stager=DatabricksVolumeDeltaTableStager,
-    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
 )
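The stager override exists because records staged for Databricks Volumes must land as single-line JSON: pretty-printed output would introduce newlines inside records when the file is later copied into a Delta table. A small illustration of the difference, using plain json.dumps as a stand-in for write_data from unstructured_ingest.utils.data_prep:

```python
# Hedged illustration: the default SQL stager writes indented JSON, while the
# volumes -> delta tables stager forces compact, single-line JSON (and a .json
# suffix). json.dumps stands in for write_data(..., indent=...) here.
import json
from pathlib import Path

records = [{"element_id": "abc123", "text": "hello world"}]

Path("staged_default.json").write_text(json.dumps(records, indent=2))  # multi-line
Path("staged_compact.json").write_text(json.dumps(records))            # one line
```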
unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py

@@ -31,7 +31,7 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "databricks_delta_tables"
 
 
-class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
+class DatabricksDeltaTablesAccessConfig(SQLAccessConfig):
     token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
     client_secret: Optional[str] = Field(
@@ -39,8 +39,8 @@ class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
     )
 
 
-class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
-    access_config: Secret[DatabrickDeltaTablesAccessConfig]
+class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[DatabricksDeltaTablesAccessConfig]
     server_hostname: str = Field(description="server hostname connection config value")
     http_path: str = Field(description="http path connection config value")
     user_agent: str = "unstructuredio_oss"
@@ -102,24 +102,24 @@ class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
             yield cursor
 
 
-class DatabrickDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
+class DatabricksDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
     pass
 
 
-class DatabrickDeltaTablesUploadStager(SQLUploadStager):
-    upload_stager_config: DatabrickDeltaTablesUploadStagerConfig
+class DatabricksDeltaTablesUploadStager(SQLUploadStager):
+    upload_stager_config: DatabricksDeltaTablesUploadStagerConfig
 
 
-class DatabrickDeltaTablesUploaderConfig(SQLUploaderConfig):
+class DatabricksDeltaTablesUploaderConfig(SQLUploaderConfig):
     catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
     database: str = Field(description="Database name", default="default")
     table_name: str = Field(description="Table name")
 
 
 @dataclass
-class DatabrickDeltaTablesUploader(SQLUploader):
-    upload_config: DatabrickDeltaTablesUploaderConfig
-    connection_config: DatabrickDeltaTablesConnectionConfig
+class DatabricksDeltaTablesUploader(SQLUploader):
+    upload_config: DatabricksDeltaTablesUploaderConfig
+    connection_config: DatabricksDeltaTablesConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
     @contextmanager
@@ -205,9 +205,9 @@ class DatabrickDeltaTablesUploader(SQLUploader):
 
 
 databricks_delta_tables_destination_entry = DestinationRegistryEntry(
-    connection_config=DatabrickDeltaTablesConnectionConfig,
-    uploader=DatabrickDeltaTablesUploader,
-    uploader_config=DatabrickDeltaTablesUploaderConfig,
-    upload_stager=DatabrickDeltaTablesUploadStager,
-    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+    connection_config=DatabricksDeltaTablesConnectionConfig,
+    uploader=DatabricksDeltaTablesUploader,
+    uploader_config=DatabricksDeltaTablesUploaderConfig,
+    upload_stager=DatabricksDeltaTablesUploadStager,
+    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
 )
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -292,6 +292,9 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(str)
         return df
 
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
+        write_data(path=output_path, data=data)
+
     def run(
         self,
         elements_filepath: Path,
@@ -314,7 +317,7 @@
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
-        write_data(path=output_path, data=df.to_dict(orient="records"))
+        self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
         return output_path
 
 
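The new write_output hook is the extension point the Databricks volumes stager above relies on: subclasses can change how the staged records are serialized without re-implementing run(). A hedged sketch with a hypothetical subclass (NdjsonStager is illustrative only, not part of the package):

```python
# Hedged sketch: overriding the new write_output() hook on SQLUploadStager.
# NdjsonStager is a hypothetical example class.
import json
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager


class NdjsonStager(SQLUploadStager):
    def write_output(self, output_path: Path, data: list[dict]) -> None:
        # one JSON object per line instead of the default write_data() output
        with output_path.with_suffix(".ndjson").open("w") as f:
            for row in data:
                f.write(json.dumps(row) + "\n")
```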
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py

@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from datetime import date, datetime
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
@@ -15,10 +16,10 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    VectorDBUploader,
 )
 from unstructured_ingest.v2.logger import logger
 
@@ -160,7 +161,9 @@ class WeaviateUploadStager(UploadStager):
 
 
 class WeaviateUploaderConfig(UploaderConfig):
-    collection: str = Field(description="The name of the collection this object belongs to")
+    collection: Optional[str] = Field(
+        description="The name of the collection this object belongs to", default=None
+    )
     batch_size: Optional[int] = Field(default=None, description="Number of records per batch")
     requests_per_minute: Optional[int] = Field(default=None, description="Rate limit for upload")
     dynamic_batch: bool = Field(default=True, description="Whether to use dynamic batch")
@@ -205,17 +208,50 @@ class WeaviateUploaderConfig(UploaderConfig):
 
 
 @dataclass
-class WeaviateUploader(Uploader, ABC):
+class WeaviateUploader(VectorDBUploader, ABC):
     upload_config: WeaviateUploaderConfig
     connection_config: WeaviateConnectionConfig
 
+    def _collection_exists(self, collection_name: Optional[str] = None):
+        collection_name = collection_name or self.upload_config.collection
+        with self.connection_config.get_client() as weaviate_client:
+            return weaviate_client.collections.exists(name=collection_name)
+
     def precheck(self) -> None:
         try:
             self.connection_config.get_client()
+            # only if collection name populated should we check that it exists
+            if self.upload_config.collection and not self._collection_exists():
+                raise DestinationConnectionError(
+                    f"collection '{self.upload_config.collection}' does not exist"
+                )
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
+    def init(self, *kwargs: Any) -> None:
+        self.create_destination()
+
+    def create_destination(
+        self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
+    ) -> bool:
+        collection_name = self.upload_config.collection or destination_name
+        self.upload_config.collection = collection_name
+        connectors_dir = Path(__file__).parents[1]
+        collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
+        with collection_config_file.open() as f:
+            collection_config = json.load(f)
+        collection_config["class"] = collection_name
+        if not self._collection_exists():
+            logger.info(
+                f"creating default weaviate collection '{collection_name}' with default configs"
+            )
+            with self.connection_config.get_client() as weaviate_client:
+                weaviate_client.collections.create_from_dict(config=collection_config)
+            return True
+        logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
+        return False
+
     def check_for_errors(self, client: "WeaviateClient") -> None:
         failed_uploads = client.batch.failed_objects
         if failed_uploads:
@@ -253,6 +289,8 @@ class WeaviateUploader(Uploader, ABC):
             f"writing {len(data)} objects to destination "
             f"class {self.connection_config.access_config} "
         )
+        if not self.upload_config.collection:
+            raise ValueError("No collection specified")
 
         with self.connection_config.get_client() as weaviate_client:
             self.delete_by_record_id(client=weaviate_client, file_data=file_data)
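With collection now optional, precheck() only verifies an explicitly configured collection, while init()/create_destination() bootstrap a default "elements" collection from the bundled assets/weaviate_collection_config.json when none exists. The underlying calls are the weaviate-client v4 collections API already used above; a hedged sketch of that bootstrap in isolation (the config dict stands in for the bundled JSON, and the connection is a placeholder):

```python
# Hedged sketch of the collection bootstrap, using the weaviate-client v4 API
# the uploader relies on. The dict is a stand-in for the packaged
# assets/weaviate_collection_config.json; connection details are placeholders.
import weaviate

collection_config = {"class": "elements", "vectorizer": "none"}

with weaviate.connect_to_local() as client:
    if not client.collections.exists("elements"):
        client.collections.create_from_dict(config=collection_config)
```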
unstructured_ingest/v2/processes/embedder.py

@@ -184,6 +184,9 @@ class EmbedderConfig(BaseModel):
 class Embedder(BaseProcess, ABC):
     config: EmbedderConfig
 
+    def init(self, *kwargs: Any) -> None:
+        self.config.get_embedder().initialize()
+
     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         # TODO update base embedder classes to support async
         embedder = self.config.get_embedder()
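The embedder now participates in the same init() lifecycle hook as the uploaders: calling it eagerly builds and initializes the configured embedding client, so misconfiguration surfaces before elements are processed. A hedged sketch (the provider value and the embedding_provider field name are assumptions for illustration):

```python
# Hedged sketch: init() eagerly initializes the configured embedder client.
# "huggingface" and the embedding_provider field name are illustrative
# assumptions, not confirmed by this diff.
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig

embedder = Embedder(config=EmbedderConfig(embedding_provider="huggingface"))
embedder.init()  # -> self.config.get_embedder().initialize()
```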
{unstructured_ingest-0.4.1.dist-info → unstructured_ingest-0.4.3.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.4.1
+Version: 0.4.3
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,38 +22,38 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: tqdm
-Requires-Dist: pydantic>=2.7
-Requires-Dist: python-dateutil
-Requires-Dist: pandas
 Requires-Dist: dataclasses-json
+Requires-Dist: pandas
+Requires-Dist: python-dateutil
 Requires-Dist: opentelemetry-sdk
 Requires-Dist: click
+Requires-Dist: pydantic>=2.7
+Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
 Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: atlassian-python-api; extra == "confluence"
 Requires-Dist: requests; extra == "confluence"
+Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: doc
@@ -72,8 +72,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
@@ -93,11 +93,11 @@ Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
 Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -126,16 +126,16 @@ Requires-Dist: neo4j; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -144,8 +144,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: msal; extra == "outlook"
+Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -174,26 +174,26 @@ Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
+Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vastdb
-Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
+Requires-Dist: pyarrow; extra == "vastdb"
 Provides-Extra: vectara
 Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"