unstructured-ingest 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (39)
  1. test/integration/connectors/duckdb/__init__.py +0 -0
  2. test/integration/connectors/duckdb/test_duckdb.py +82 -0
  3. test/integration/connectors/duckdb/test_motherduck.py +106 -0
  4. test/integration/connectors/test_kafka.py +109 -6
  5. test/integration/connectors/test_qdrant.py +55 -0
  6. test/unit/v2/connectors/test_confluence.py +39 -0
  7. unstructured_ingest/__version__.py +1 -1
  8. unstructured_ingest/v2/processes/connectors/__init__.py +1 -0
  9. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +24 -21
  10. unstructured_ingest/v2/processes/connectors/chroma.py +6 -5
  11. unstructured_ingest/v2/processes/connectors/confluence.py +14 -2
  12. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  13. unstructured_ingest/v2/processes/connectors/duckdb/base.py +99 -0
  14. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +118 -0
  15. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +133 -0
  16. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +34 -15
  17. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -2
  18. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -11
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +3 -3
  20. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +2 -2
  21. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +2 -3
  22. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +8 -8
  23. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +9 -2
  24. unstructured_ingest/v2/processes/connectors/kafka/local.py +1 -1
  25. unstructured_ingest/v2/processes/connectors/kdbai.py +2 -2
  26. unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
  27. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +6 -4
  28. unstructured_ingest/v2/processes/connectors/sql/__init__.py +2 -1
  29. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +7 -9
  30. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +62 -24
  31. unstructured_ingest/v2/processes/connectors/sql/sql.py +8 -3
  32. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +6 -9
  33. unstructured_ingest/v2/utils.py +9 -0
  34. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/METADATA +19 -17
  35. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/RECORD +39 -31
  36. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/LICENSE.md +0 -0
  37. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/WHEEL +0 -0
  38. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/entry_points.txt +0 -0
  39. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/kdbai.py

@@ -1,5 +1,4 @@
 import json
-import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
 )
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
     from kdbai_client import Database, Session, Table
@@ -81,7 +81,7 @@ class KdbaiUploadStager(UploadStager):
         for element in elements_contents:
             data.append(
                 {
-                    "id": str(uuid.uuid4()),
+                    "id": get_enhanced_element_id(element_dict=element, file_data=file_data),
                     "element_id": element.get("element_id"),
                     "document": element.pop("text", None),
                     "embeddings": element.get("embeddings"),
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -1,5 +1,4 @@
 import json
-import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
@@ -21,6 +20,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
     from pinecone import Index as PineconeIndex
@@ -149,7 +149,7 @@ class PineconeUploadStager(UploadStager):
         metadata[RECORD_ID_LABEL] = file_data.identifier
 
         return {
-            "id": str(uuid.uuid4()),
+            "id": get_enhanced_element_id(element_dict=element_dict, file_data=file_data),
             "values": embeddings,
             "metadata": metadata,
         }
unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py

@@ -1,6 +1,5 @@
 import asyncio
 import json
-import uuid
 from abc import ABC, abstractmethod
 from contextlib import asynccontextmanager
 from dataclasses import dataclass, field
@@ -22,6 +21,7 @@ from unstructured_ingest.v2.interfaces import (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
     from qdrant_client import AsyncQdrantClient
@@ -64,10 +64,10 @@ class QdrantUploadStager(UploadStager, ABC):
     )
 
     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
         """Prepares dictionary in the format that Chroma requires"""
         return {
-            "id": str(uuid.uuid4()),
+            "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
             "vector": data.pop("embeddings", {}),
             "payload": {
                 "text": data.pop("text", None),
@@ -91,7 +91,9 @@ class QdrantUploadStager(UploadStager, ABC):
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
 
-        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
 
         with open(output_path, "w") as output_file:
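
Across the kdbai, pinecone, and qdrant stagers above, the randomly generated str(uuid.uuid4()) record ID is replaced by a deterministic ID derived from the element and its source file, so re-running an ingest updates existing records rather than duplicating them. A simplified, self-contained sketch of the new qdrant mapping (the inline ID derivation stands in for get_enhanced_element_id, which the v2/utils.py hunk further down defines):

```python
from uuid import NAMESPACE_DNS, uuid5

def conform_dict(data: dict, file_identifier: str) -> dict:
    # Stand-in for get_enhanced_element_id (see the v2/utils.py hunk below).
    record_id = str(uuid5(NAMESPACE_DNS, f"{data.get('element_id')}{file_identifier}"))
    return {
        "id": record_id,                       # deterministic, replaces str(uuid.uuid4())
        "vector": data.pop("embeddings", {}),  # embeddings become the point vector
        "payload": {"text": data.pop("text", None)},
    }

element = {"element_id": "e-1", "text": "hello", "embeddings": [0.1, 0.2, 0.3]}
print(conform_dict(element, file_identifier="doc-123"))
```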
unstructured_ingest/v2/processes/connectors/sql/__init__.py

@@ -8,7 +8,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
 from .postgres import postgres_destination_entry, postgres_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
-from .singlestore import singlestore_destination_entry
+from .singlestore import singlestore_destination_entry, singlestore_source_entry
 from .snowflake import CONNECTOR_TYPE as SNOWFLAKE_CONNECTOR_TYPE
 from .snowflake import snowflake_destination_entry, snowflake_source_entry
 from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
@@ -17,6 +17,7 @@ from .sqlite import sqlite_destination_entry, sqlite_source_entry
 add_source_entry(source_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_source_entry)
 add_source_entry(source_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_source_entry)
 add_source_entry(source_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_source_entry)
+add_source_entry(source_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_source_entry)
 
 add_destination_entry(destination_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_destination_entry)
 add_destination_entry(destination_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_destination_entry)
unstructured_ingest/v2/processes/connectors/sql/singlestore.py

@@ -91,22 +91,20 @@ class SingleStoreDownloader(SQLDownloader):
     connection_config: SingleStoreConnectionConfig
     download_config: SingleStoreDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
+    values_delimiter: str = "%s"
 
     def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
         table_name = file_data.additional_metadata["table_name"]
         id_column = file_data.additional_metadata["id_column"]
-        ids = file_data.additional_metadata["ids"]
+        ids = tuple(file_data.additional_metadata["ids"])
         with self.connection_config.get_connection() as sqlite_connection:
             cursor = sqlite_connection.cursor()
             fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
-            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
-                fields=fields,
-                table_name=table_name,
-                id_column=id_column,
-                ids=",".join([str(i) for i in ids]),
+            query = (
+                f"SELECT {fields} FROM {table_name} WHERE {id_column} IN {self.values_delimiter}"
             )
-            logger.debug(f"running query: {query}")
-            cursor.execute(query)
+            logger.debug(f"running query: {query}\nwith values: {(ids,)}")
+            cursor.execute(query, (ids,))
             rows = cursor.fetchall()
             columns = [col[0] for col in cursor.description]
             return rows, columns
@@ -154,7 +152,7 @@ class SingleStoreUploader(SQLUploader):
 singlestore_source_entry = SourceRegistryEntry(
     connection_config=SingleStoreConnectionConfig,
     indexer_config=SingleStoreIndexerConfig,
-    indexer=SQLIndexer,
+    indexer=SingleStoreIndexer,
     downloader_config=SingleStoreDownloaderConfig,
     downloader=SingleStoreDownloader,
 )
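
The SingleStore downloader now binds the IDs as a query parameter instead of interpolating them into the SQL string. Note the placeholder convention differs by driver: here a single %s receives the whole id tuple (relying on the client library to expand it), while the sqlite and snowflake downloaders below emit one placeholder per id. A rough sketch of the two query-building styles, as pure string-building with no driver required (the expansion behavior itself is the driver's concern):

```python
def singlestore_style(ids: list) -> tuple[str, tuple]:
    # One %s placeholder; the whole id tuple is bound as a single parameter.
    return "SELECT * FROM t WHERE id IN %s", (tuple(ids),)

def sqlite_style(ids: list) -> tuple[str, list]:
    # One ? placeholder per id; each id is bound individually.
    placeholders = ",".join("?" for _ in ids)
    return f"SELECT * FROM t WHERE id IN ({placeholders})", ids

print(singlestore_style([1, 2, 3]))  # ('SELECT * FROM t WHERE id IN %s', ((1, 2, 3),))
print(sqlite_style([1, 2, 3]))       # ('SELECT * FROM t WHERE id IN (?,?,?)', [1, 2, 3])
```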
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -7,22 +7,26 @@ import numpy as np
 import pandas as pd
 from pydantic import Field, Secret
 
+from unstructured_ingest.utils.data_prep import split_dataframe
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces.file_data import FileData
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
-from unstructured_ingest.v2.processes.connectors.sql.postgres import (
-    PostgresDownloader,
-    PostgresDownloaderConfig,
-    PostgresIndexer,
-    PostgresIndexerConfig,
-    PostgresUploader,
-    PostgresUploaderConfig,
-    PostgresUploadStager,
-    PostgresUploadStagerConfig,
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
 )
-from unstructured_ingest.v2.processes.connectors.sql.sql import SQLAccessConfig, SQLConnectionConfig
 
 if TYPE_CHECKING:
     from snowflake.connector import SnowflakeConnection
@@ -59,6 +63,7 @@ class SnowflakeConnectionConfig(SQLConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     @contextmanager
+    # The actual snowflake module package name is: snowflake-connector-python
     @requires_dependencies(["snowflake"], extras="snowflake")
     def get_connection(self) -> Generator["SnowflakeConnection", None, None]:
         # https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#label-snowflake-connector-methods-connect
@@ -89,42 +94,67 @@ class SnowflakeConnectionConfig(SQLConnectionConfig):
         cursor.close()
 
 
-class SnowflakeIndexerConfig(PostgresIndexerConfig):
+class SnowflakeIndexerConfig(SQLIndexerConfig):
     pass
 
 
 @dataclass
-class SnowflakeIndexer(PostgresIndexer):
+class SnowflakeIndexer(SQLIndexer):
     connection_config: SnowflakeConnectionConfig
     index_config: SnowflakeIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
 
-class SnowflakeDownloaderConfig(PostgresDownloaderConfig):
+class SnowflakeDownloaderConfig(SQLDownloaderConfig):
     pass
 
 
 @dataclass
-class SnowflakeDownloader(PostgresDownloader):
+class SnowflakeDownloader(SQLDownloader):
     connection_config: SnowflakeConnectionConfig
     download_config: SnowflakeDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
+    values_delimiter: str = "?"
 
-
-class SnowflakeUploadStagerConfig(PostgresUploadStagerConfig):
+    # The actual snowflake module package name is: snowflake-connector-python
+    @requires_dependencies(["snowflake"], extras="snowflake")
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+
+        with self.connection_config.get_cursor() as cursor:
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} IN ({values})".format(
+                table_name=table_name,
+                id_column=id_column,
+                fields=(
+                    ",".join(self.download_config.fields) if self.download_config.fields else "*"
+                ),
+                values=",".join([self.values_delimiter for _ in ids]),
+            )
+            logger.debug(f"running query: {query}\nwith values: {ids}")
+            cursor.execute(query, ids)
+            rows = [
+                tuple(row.values()) if isinstance(row, dict) else row for row in cursor.fetchall()
+            ]
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
+class SnowflakeUploadStagerConfig(SQLUploadStagerConfig):
     pass
 
 
-class SnowflakeUploadStager(PostgresUploadStager):
+class SnowflakeUploadStager(SQLUploadStager):
     upload_stager_config: SnowflakeUploadStagerConfig
 
 
-class SnowflakeUploaderConfig(PostgresUploaderConfig):
+class SnowflakeUploaderConfig(SQLUploaderConfig):
     pass
 
 
 @dataclass
-class SnowflakeUploader(PostgresUploader):
+class SnowflakeUploader(SQLUploader):
     upload_config: SnowflakeUploaderConfig = field(default_factory=SnowflakeUploaderConfig)
     connection_config: SnowflakeConnectionConfig
     connector_type: str = CONNECTOR_TYPE
@@ -135,15 +165,23 @@ class SnowflakeUploader(PostgresUploader):
         df.replace({np.nan: None}, inplace=True)
 
         columns = list(df.columns)
-        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) VALUES({','.join([self.values_delimiter for x in columns])})"  # noqa E501
-
-        for rows in pd.read_json(
-            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
-        ):
+        stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
+            table_name=self.upload_config.table_name,
+            columns=",".join(columns),
+            values=",".join([self.values_delimiter for _ in columns]),
+        )
+        logger.info(
+            f"writing a total of {len(df)} elements via"
+            f" document batches to destination"
+            f" table named {self.upload_config.table_name}"
+            f" with batch size {self.upload_config.batch_size}"
+        )
+        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
             with self.connection_config.get_cursor() as cursor:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                 # TODO: executemany break on 'Binding data in type (list) is not supported'
                 for val in values:
+                    logger.debug(f"running query: {stmt}\nwith values: {val}")
                     cursor.execute(stmt, val)
 
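The Snowflake uploader previously re-read the staged file with pd.read_json(..., chunksize=...); it now batches the DataFrame it already holds via split_dataframe from unstructured_ingest.utils.data_prep. A minimal equivalent, assuming the helper simply yields fixed-size row slices:

```python
from typing import Iterator

import pandas as pd

def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Iterator[pd.DataFrame]:
    # Yield successive slices of at most chunk_size rows.
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start : start + chunk_size]

df = pd.DataFrame({"id": range(5), "text": list("abcde")})
print([len(chunk) for chunk in split_dataframe(df, chunk_size=2)])  # [2, 2, 1]
```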
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -1,7 +1,6 @@
 import hashlib
 import json
 import sys
-import uuid
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
@@ -35,6 +34,7 @@ from unstructured_ingest.v2.interfaces import (
     download_responses,
 )
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 _COLUMNS = (
     "id",
@@ -251,7 +251,7 @@ class SQLUploadStager(UploadStager):
         element.update(data_source)
         element.update(coordinates)
 
-        element["id"] = str(uuid.uuid4())
+        element["id"] = get_enhanced_element_id(element_dict=element, file_data=file_data)
 
         # remove extraneous, not supported columns
         element = {k: v for k, v in element.items() if k in _COLUMNS}
@@ -367,7 +367,11 @@ class SQLUploader(Uploader):
         self._fit_to_schema(df=df, columns=self.get_table_columns())
 
         columns = list(df.columns)
-        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) VALUES({','.join([self.values_delimiter for x in columns])})"  # noqa E501
+        stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
+            table_name=self.upload_config.table_name,
+            columns=",".join(columns),
+            values=",".join([self.values_delimiter for _ in columns]),
+        )
         logger.info(
             f"writing a total of {len(df)} elements via"
             f" document batches to destination"
@@ -384,6 +388,7 @@ class SQLUploader(Uploader):
             # except Exception as e:
             #     print(f"Error: {e}")
             #     print(f"failed to write {len(columns)}, {len(val)}: {stmt} -> {val}")
+            logger.debug(f"running query: {stmt}")
             cursor.executemany(stmt, values)
 
     def get_table_columns(self) -> list[str]:
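
Both this generic SQLUploader and the Snowflake subclass now build the INSERT statement with str.format, keyed by the driver-specific values_delimiter. A quick illustration of the statements this construction produces:

```python
def build_insert(table_name: str, columns: list[str], values_delimiter: str) -> str:
    # Same construction as the new stmt in SQLUploader.
    return "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
        table_name=table_name,
        columns=",".join(columns),
        values=",".join(values_delimiter for _ in columns),
    )

print(build_insert("elements", ["id", "text"], "?"))   # INSERT INTO elements (id,text) VALUES(?,?)
print(build_insert("elements", ["id", "text"], "%s"))  # INSERT INTO elements (id,text) VALUES(%s,%s)
```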
unstructured_ingest/v2/processes/connectors/sql/sqlite.py

@@ -95,6 +95,7 @@ class SQLiteDownloader(SQLDownloader):
     connection_config: SQLConnectionConfig
     download_config: SQLDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
+    values_delimiter: str = "?"
 
     def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
         table_name = file_data.additional_metadata["table_name"]
@@ -103,14 +104,10 @@ class SQLiteDownloader(SQLDownloader):
         with self.connection_config.get_connection() as sqlite_connection:
             cursor = sqlite_connection.cursor()
             fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
-            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
-                fields=fields,
-                table_name=table_name,
-                id_column=id_column,
-                ids=",".join([str(i) for i in ids]),
-            )
-            logger.debug(f"running query: {query}")
-            cursor.execute(query)
+            values = ",".join(self.values_delimiter for _ in ids)
+            query = f"SELECT {fields} FROM {table_name} WHERE {id_column} IN ({values})"
+            logger.debug(f"running query: {query}\nwith values: {ids}")
+            cursor.execute(query, ids)
             rows = cursor.fetchall()
             columns = [col[0] for col in cursor.description]
             return rows, columns
@@ -157,7 +154,7 @@ class SQLiteUploader(SQLUploader):
 sqlite_source_entry = SourceRegistryEntry(
     connection_config=SQLiteConnectionConfig,
     indexer_config=SQLiteIndexerConfig,
-    indexer=SQLIndexer,
+    indexer=SQLiteIndexer,
     downloader_config=SQLiteDownloaderConfig,
     downloader=SQLiteDownloader,
 )
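
Because the SQLite connector rides on Python's built-in sqlite3 driver, the new parameterized query_db pattern can be demonstrated end to end with the standard library alone (the table and data here are made up for the demo):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE elements (id INTEGER, text TEXT)")
conn.executemany("INSERT INTO elements VALUES (?, ?)", [(1, "a"), (2, "b"), (3, "c")])

ids = [1, 3]
placeholders = ",".join("?" for _ in ids)  # one "?" per id, as in the new query_db
rows = conn.execute(
    f"SELECT id, text FROM elements WHERE id IN ({placeholders})", ids
).fetchall()
print(rows)  # [(1, 'a'), (3, 'c')]
```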
unstructured_ingest/v2/utils.py

@@ -3,10 +3,13 @@ from datetime import datetime
 from inspect import isclass
 from pathlib import Path
 from typing import Any
+from uuid import NAMESPACE_DNS, uuid5
 
 from pydantic import BaseModel
 from pydantic.types import _SecretBase
 
+from unstructured_ingest.v2.interfaces import FileData
+
 
 def is_secret(value: Any) -> bool:
     # Case Secret[int]
@@ -50,3 +53,9 @@ def serialize_base_model_json(model: BaseModel, **json_kwargs) -> str:
 
     # Support json dumps kwargs such as sort_keys
     return json.dumps(model_dict, default=json_serial, **json_kwargs)
+
+
+def get_enhanced_element_id(element_dict: dict, file_data: FileData) -> str:
+    element_id = element_dict.get("element_id")
+    new_data = f"{element_id}{file_data.identifier}"
+    return str(uuid5(NAMESPACE_DNS, new_data))
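
This helper is what makes the stager IDs deterministic: uuid5 hashes the element_id together with the file identifier, so the same element re-ingested from the same file maps to the same ID, while identical elements from different files stay distinct. A quick property check, using a hypothetical stand-in for FileData:

```python
from dataclasses import dataclass
from uuid import NAMESPACE_DNS, uuid5

@dataclass
class FakeFileData:  # hypothetical stand-in for the real FileData class
    identifier: str

def get_enhanced_element_id(element_dict: dict, file_data) -> str:
    # Same logic as the helper added above.
    return str(uuid5(NAMESPACE_DNS, f"{element_dict.get('element_id')}{file_data.identifier}"))

elem = {"element_id": "e-42"}
assert get_enhanced_element_id(elem, FakeFileData("doc-1")) == \
       get_enhanced_element_id(elem, FakeFileData("doc-1"))  # stable across re-runs
assert get_enhanced_element_id(elem, FakeFileData("doc-1")) != \
       get_enhanced_element_id(elem, FakeFileData("doc-2"))  # unique per source file
```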
{unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.6
+Version: 0.3.7
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,27 +22,27 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: python-dateutil
+Requires-Dist: pandas
 Requires-Dist: tqdm
 Requires-Dist: pydantic>=2.7
-Requires-Dist: dataclasses-json
 Requires-Dist: opentelemetry-sdk
-Requires-Dist: pandas
+Requires-Dist: dataclasses-json
 Requires-Dist: click
+Requires-Dist: python-dateutil
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
 Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord-py; extra == "discord"
 Provides-Extra: doc
@@ -69,8 +69,10 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: fsspec; extra == "dropbox"
+Provides-Extra: duckdb
+Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: embed-huggingface
@@ -78,8 +80,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -87,9 +89,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -98,8 +100,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -119,17 +121,17 @@ Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
 Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -158,8 +160,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
@@ -173,8 +175,8 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: snowflake-connector-python; extra == "snowflake"
 Requires-Dist: psycopg2-binary; extra == "snowflake"
+Requires-Dist: snowflake-connector-python; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv