unstructured-ingest 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest has been flagged as potentially problematic.
Files changed (23)
  1. test/integration/connectors/sql/test_databricks_delta_tables.py +142 -0
  2. test/integration/connectors/test_confluence.py +4 -4
  3. test/integration/connectors/test_pinecone.py +68 -2
  4. test/unit/v2/connectors/sql/__init__.py +0 -0
  5. test/unit/v2/connectors/sql/test_sql.py +72 -0
  6. test/unit/v2/connectors/test_confluence.py +6 -6
  7. unstructured_ingest/__version__.py +1 -1
  8. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  9. unstructured_ingest/v2/processes/connectors/confluence.py +30 -10
  10. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +6 -0
  11. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +6 -3
  12. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +106 -0
  13. unstructured_ingest/v2/processes/connectors/pinecone.py +18 -11
  14. unstructured_ingest/v2/processes/connectors/sql/__init__.py +6 -0
  15. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +213 -0
  16. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
  17. unstructured_ingest/v2/processes/connectors/sql/sql.py +28 -9
  18. {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/METADATA +22 -20
  19. {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/RECORD +23 -18
  20. {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/LICENSE.md +0 -0
  21. {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/WHEEL +0 -0
  22. {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/entry_points.txt +0 -0
  23. {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -5,12 +5,10 @@ from typing import TYPE_CHECKING, Any, Optional
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import (
-    flatten_dict,
-    generator_batching_wbytes,
-)
+from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import UserError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -63,6 +61,7 @@ class PineconeConnectionConfig(ConnectionConfig):
         pc = self.get_client()
 
         index = pc.Index(name=self.index_name, **index_kwargs)
+
         logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index
 
@@ -182,14 +181,18 @@ class PineconeUploader(Uploader):
         delete_kwargs = {
             "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
         }
+
         if namespace := self.upload_config.namespace:
             delete_kwargs["namespace"] = namespace
+        try:
+            index.delete(**delete_kwargs)
+        except UserError as e:
+            logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
 
-        resp = index.delete(**delete_kwargs)
         logger.debug(
             f"deleted any content with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
-            f"from pinecone index: {resp}"
+            f"from pinecone index: {delete_kwargs}"
         )
 
     def serverless_delete_by_record_id(self, file_data: FileData) -> None:
@@ -203,15 +206,19 @@ class PineconeUploader(Uploader):
         deleted_ids = 0
         if namespace := self.upload_config.namespace:
             list_kwargs["namespace"] = namespace
+
         for ids in index.list(**list_kwargs):
             deleted_ids += len(ids)
             delete_kwargs = {"ids": ids}
+
             if namespace := self.upload_config.namespace:
-                delete_resp = delete_kwargs["namespace"] = namespace
-                # delete_resp should be an empty dict if there were no errors
-                if delete_resp:
-                    logger.error(f"failed to delete batch of ids: {delete_resp}")
-            index.delete(**delete_kwargs)
+                delete_kwargs["namespace"] = namespace
+
+            try:
+                index.delete(**delete_kwargs)
+            except UserError as e:
+                logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
+
         logger.info(
             f"deleted {deleted_ids} records with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
unstructured_ingest/v2/processes/connectors/sql/__init__.py

@@ -5,6 +5,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )
 
+from .databricks_delta_tables import CONNECTOR_TYPE as DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE
+from .databricks_delta_tables import databricks_delta_tables_destination_entry
 from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
 from .postgres import postgres_destination_entry, postgres_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
@@ -25,3 +27,7 @@ add_destination_entry(destination_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake
 add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
+add_destination_entry(
+    destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
+    entry=databricks_delta_tables_destination_entry,
+)
unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py (new file)

@@ -0,0 +1,213 @@
+import json
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.data_prep import split_dataframe
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from databricks.sdk.core import oauth_service_principal
+    from databricks.sql.client import Connection as DeltaTableConnection
+    from databricks.sql.client import Cursor as DeltaTableCursor
+
+CONNECTOR_TYPE = "databricks_delta_tables"
+
+
+class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
+    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
+
+
+class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[DatabrickDeltaTablesAccessConfig]
+    server_hostname: str = Field(description="server hostname connection config value")
+    http_path: str = Field(description="http path connection config value")
+    user_agent: str = "unstructuredio_oss"
+
+    @requires_dependencies(["databricks"], extras="databricks-delta-tables")
+    def get_credentials_provider(self) -> "oauth_service_principal":
+        from databricks.sdk.core import Config, oauth_service_principal
+
+        host = f"https://{self.server_hostname}"
+        access_configs = self.access_config.get_secret_value()
+        if (client_id := access_configs.client_id) and (
+            client_secret := access_configs.client_secret
+        ):
+            return oauth_service_principal(
+                Config(
+                    host=host,
+                    client_id=client_id,
+                    client_secret=client_secret,
+                )
+            )
+        return False
+
+    def model_post_init(self, __context: Any) -> None:
+        access_config = self.access_config.get_secret_value()
+        if access_config.token and access_config.client_secret and access_config.client_id:
+            raise ValueError(
+                "One one for of auth can be provided, either token or client id and secret"
+            )
+        if not access_config.token and not (
+            access_config.client_secret and access_config.client_id
+        ):
+            raise ValueError(
+                "One form of auth must be provided, either token or client id and secret"
+            )
+
+    @contextmanager
+    @requires_dependencies(["databricks"], extras="databricks-delta-tables")
+    def get_connection(self, **connect_kwargs) -> Generator["DeltaTableConnection", None, None]:
+        from databricks.sql import connect
+
+        connect_kwargs = connect_kwargs or {}
+        connect_kwargs["_user_agent_entry"] = self.user_agent
+        connect_kwargs["server_hostname"] = connect_kwargs.get(
+            "server_hostname", self.server_hostname
+        )
+        connect_kwargs["http_path"] = connect_kwargs.get("http_path", self.http_path)
+
+        if credential_provider := self.get_credentials_provider():
+            connect_kwargs["credentials_provider"] = credential_provider
+        else:
+            connect_kwargs["access_token"] = self.access_config.get_secret_value().token
+        with connect(**connect_kwargs) as connection:
+            yield connection
+
+    @contextmanager
+    def get_cursor(self, **connect_kwargs) -> Generator["DeltaTableCursor", None, None]:
+        with self.get_connection(**connect_kwargs) as connection:
+            cursor = connection.cursor()
+            yield cursor
+
+
+class DatabrickDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+class DatabrickDeltaTablesUploadStager(SQLUploadStager):
+    upload_stager_config: DatabrickDeltaTablesUploadStagerConfig
+
+
+class DatabrickDeltaTablesUploaderConfig(SQLUploaderConfig):
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    database: str = Field(description="Database name", default="default")
+    table_name: str = Field(description="Table name")
+
+
+@dataclass
+class DatabrickDeltaTablesUploader(SQLUploader):
+    upload_config: DatabrickDeltaTablesUploaderConfig
+    connection_config: DatabrickDeltaTablesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            yield cursor
+
+    def precheck(self) -> None:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute("SHOW CATALOGS")
+            catalogs = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.catalog not in catalogs:
+                raise ValueError(
+                    "Catalog {} not found in {}".format(
+                        self.upload_config.catalog, ", ".join(catalogs)
+                    )
+                )
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            cursor.execute("SHOW DATABASES")
+            databases = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.database not in databases:
+                raise ValueError(
+                    "Database {} not found in {}".format(
+                        self.upload_config.database, ", ".join(databases)
+                    )
+                )
+            cursor.execute("SHOW TABLES")
+            table_names = [r[1] for r in cursor.fetchall()]
+            if self.upload_config.table_name not in table_names:
+                raise ValueError(
+                    "Table {} not found in {}".format(
+                        self.upload_config.table_name, ", ".join(table_names)
+                    )
+                )
+
+    def create_statement(self, columns: list[str], values: tuple[Any, ...]) -> str:
+        values_list = []
+        for v in values:
+            if isinstance(v, dict):
+                values_list.append(json.dumps(v))
+            elif isinstance(v, list):
+                if v and isinstance(v[0], (int, float)):
+                    values_list.append("ARRAY({})".format(", ".join([str(val) for val in v])))
+                else:
+                    values_list.append("ARRAY({})".format(", ".join([f"'{val}'" for val in v])))
+            else:
+                values_list.append(f"'{v}'")
+        statement = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
+            table_name=self.upload_config.table_name,
+            columns=", ".join(columns),
+            values=", ".join(values_list),
+        )
+        return statement
+
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        if self.can_delete():
+            self.delete_by_record_id(file_data=file_data)
+        else:
+            logger.warning(
+                f"table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+        df.replace({np.nan: None}, inplace=True)
+        self._fit_to_schema(df=df)
+
+        columns = list(df.columns)
+        logger.info(
+            f"writing a total of {len(df)} elements via"
+            f" document batches to destination"
+            f" table named {self.upload_config.table_name}"
+            # f" with batch size {self.upload_config.batch_size}"
+        )
+        # TODO: currently variable binding not supporting for list types,
+        # update once that gets resolved in SDK
+        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
+            with self.get_cursor() as cursor:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                for v in values:
+                    stmt = self.create_statement(columns=columns, values=v)
+                    cursor.execute(stmt)
+
+
+databricks_delta_tables_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabrickDeltaTablesConnectionConfig,
+    uploader=DatabrickDeltaTablesUploader,
+    uploader_config=DatabrickDeltaTablesUploaderConfig,
+    upload_stager=DatabrickDeltaTablesUploadStager,
+    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+)
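Taken together, the new module provides a complete destination connector for Databricks Delta Tables, registered in sql/__init__.py above under the databricks_delta_tables connector type. A minimal configuration sketch using only the fields declared in the new file; every credential and location value is a placeholder, and it assumes the pydantic Secret field accepts the plain access config at construction time, as in the package's other connectors:

# Hypothetical wiring of the new Databricks Delta Tables destination;
# all credential, hostname, path, and table values below are placeholders.
from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
    DatabrickDeltaTablesAccessConfig,
    DatabrickDeltaTablesConnectionConfig,
    DatabrickDeltaTablesUploader,
    DatabrickDeltaTablesUploaderConfig,
)

connection_config = DatabrickDeltaTablesConnectionConfig(
    # Exactly one auth form is allowed: a PAT token, or client_id + client_secret.
    access_config=DatabrickDeltaTablesAccessConfig(token="dapi-placeholder-token"),
    server_hostname="adb-0000000000000000.0.azuredatabricks.net",
    http_path="/sql/1.0/warehouses/placeholder",
)

uploader = DatabrickDeltaTablesUploader(
    connection_config=connection_config,
    upload_config=DatabrickDeltaTablesUploaderConfig(
        catalog="main",
        database="default",
        table_name="elements",
    ),
)

# precheck() verifies the catalog, database, and table exist before any insert runs.
uploader.precheck()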
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -170,7 +170,7 @@ class SnowflakeUploader(SQLUploader):
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df, columns=self.get_table_columns())
+        self._fit_to_schema(df=df)
 
         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -129,8 +129,13 @@ class SQLIndexer(Indexer, ABC):
     connection_config: SQLConnectionConfig
     index_config: SQLIndexerConfig
 
-    def _get_doc_ids(self) -> list[str]:
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor() as cursor:
+            yield cursor
+
+    def _get_doc_ids(self) -> list[str]:
+        with self.get_cursor() as cursor:
             cursor.execute(
                 f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
             )
@@ -140,7 +145,7 @@ class SQLIndexer(Indexer, ABC):
 
     def precheck(self) -> None:
         try:
-            with self.connection_config.get_cursor() as cursor:
+            with self.get_cursor() as cursor:
                 cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
@@ -182,6 +187,11 @@ class SQLDownloader(Downloader, ABC):
     connection_config: SQLConnectionConfig
     download_config: SQLDownloaderConfig
 
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            yield cursor
+
     @abstractmethod
     def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         pass
@@ -300,6 +310,8 @@ class SQLUploadStager(UploadStager):
         )
         df = self.conform_dataframe(df=df)
 
+        output_filename_suffix = Path(elements_filepath).suffix
+        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
         self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
@@ -323,12 +335,17 @@ class SQLUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            with self.connection_config.get_cursor() as cursor:
+            with self.get_cursor() as cursor:
                 cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            yield cursor
+
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
@@ -346,7 +363,7 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output
 
-    def _fit_to_schema(self, df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
+    def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
         columns = set(df.columns)
         schema_fields = set(columns)
         columns_to_drop = columns - schema_fields
@@ -367,6 +384,7 @@
 
         for column in missing_columns:
             df[column] = pd.Series()
+        return df
 
     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         if self.can_delete():
@@ -378,7 +396,7 @@
             f"{self.upload_config.record_id_key}, skipping delete"
             )
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df, columns=self.get_table_columns())
+        self._fit_to_schema(df=df)
 
         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
@@ -393,7 +411,7 @@
             f" with batch size {self.upload_config.batch_size}"
         )
         for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
-            with self.connection_config.get_cursor() as cursor:
+            with self.get_cursor() as cursor:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                 # For debugging purposes:
                 # for val in values:
@@ -406,7 +424,7 @@
                 cursor.executemany(stmt, values)
 
     def get_table_columns(self) -> list[str]:
-        with self.connection_config.get_cursor() as cursor:
+        with self.get_cursor() as cursor:
             cursor.execute(f"SELECT * from {self.upload_config.table_name}")
             return [desc[0] for desc in cursor.description]
 
@@ -420,10 +438,11 @@
             f"from table {self.upload_config.table_name}"
         )
         stmt = f"DELETE FROM {self.upload_config.table_name} WHERE {self.upload_config.record_id_key} = {self.values_delimiter}"  # noqa: E501
-        with self.connection_config.get_cursor() as cursor:
+        with self.get_cursor() as cursor:
             cursor.execute(stmt, [file_data.identifier])
             rowcount = cursor.rowcount
-            logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")
+            if rowcount > 0:
+                logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")
 
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         df = pd.DataFrame(data)
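The recurring change in this file is that SQLIndexer, SQLDownloader, and SQLUploader now route every cursor acquisition through a new overridable get_cursor() context manager instead of calling connection_config.get_cursor() directly. A brief sketch of what that enables for subclasses; the class name and the SET statement are illustrative only (the Databricks Delta Tables uploader added in this release uses the same hook to issue USE CATALOG):

# Sketch of a hypothetical SQLUploader subclass that injects per-session setup
# through the new get_cursor() hook; the SET statement is illustrative.
from contextlib import contextmanager
from typing import Any, Generator

from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploader


class MyWarehouseUploader(SQLUploader):
    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        with self.connection_config.get_cursor() as cursor:
            # Every precheck, insert, and delete in the base class now flows
            # through this hook, so session setup only has to happen here.
            cursor.execute("SET search_path TO ingest")
            yield cursor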
{unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.14
+Version: 0.4.0
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,43 +22,45 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: python-dateutil
+Requires-Dist: click
+Requires-Dist: pydantic>=2.7
 Requires-Dist: pandas
+Requires-Dist: ndjson
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: python-dateutil
 Requires-Dist: tqdm
 Requires-Dist: dataclasses-json
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: click
-Requires-Dist: ndjson
-Requires-Dist: pydantic>=2.7
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
 Requires-Dist: unstructured[tsv]; extra == "csv"
+Provides-Extra: databricks-delta-tables
+Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
@@ -82,8 +84,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -91,8 +93,8 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -102,8 +104,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -122,12 +124,12 @@ Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: neo4j
 Requires-Dist: cymple; extra == "neo4j"
-Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: neo4j; extra == "neo4j"
+Requires-Dist: networkx; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
@@ -136,8 +138,8 @@ Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -168,13 +170,13 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -183,16 +185,16 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vectara
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
-Requires-Dist: httpx; extra == "vectara"
 Provides-Extra: weaviate
 Requires-Dist: weaviate-client; extra == "weaviate"
 Provides-Extra: wikipedia