unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (44)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/v2/cli/base/cmd.py +10 -0
  3. unstructured_ingest/v2/cli/base/src.py +2 -0
  4. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
  5. unstructured_ingest/v2/cli/cmds/local.py +0 -8
  6. unstructured_ingest/v2/cli/configs/__init__.py +8 -1
  7. unstructured_ingest/v2/cli/configs/filter.py +28 -0
  8. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  9. unstructured_ingest/v2/interfaces/downloader.py +9 -3
  10. unstructured_ingest/v2/interfaces/file_data.py +6 -1
  11. unstructured_ingest/v2/interfaces/process.py +3 -4
  12. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  13. unstructured_ingest/v2/pipeline/pipeline.py +72 -2
  14. unstructured_ingest/v2/pipeline/steps/download.py +77 -13
  15. unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
  16. unstructured_ingest/v2/processes/connectors/astra.py +8 -0
  17. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
  18. unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
  19. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
  20. unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
  21. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -0
  22. unstructured_ingest/v2/processes/connectors/fsspec/box.py +8 -0
  23. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +8 -0
  24. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +30 -28
  25. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +8 -0
  26. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +21 -5
  27. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +8 -0
  28. unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
  29. unstructured_ingest/v2/processes/connectors/local.py +15 -15
  30. unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
  31. unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
  32. unstructured_ingest/v2/processes/connectors/opensearch.py +33 -5
  33. unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
  34. unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
  35. unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
  36. unstructured_ingest/v2/processes/connectors/sql.py +24 -9
  37. unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
  38. unstructured_ingest/v2/processes/filter.py +54 -0
  39. unstructured_ingest-0.0.3.dist-info/METADATA +175 -0
  40. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/RECORD +43 -40
  41. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  42. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/opensearch.py

@@ -100,9 +100,15 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         return OpenSearch(**self.get_client_kwargs())


+@dataclass
+class OpensearchIndexerConfig(ElasticsearchIndexerConfig):
+    pass
+
+
 @dataclass
 class OpenSearchIndexer(ElasticsearchIndexer):
     connection_config: OpenSearchConnectionConfig
+    index_config: OpensearchIndexerConfig
     client: "OpenSearch" = field(init=False)

     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -112,9 +118,15 @@ class OpenSearchIndexer(ElasticsearchIndexer):
         return scan


+@dataclass
+class OpensearchDownloaderConfig(ElasticsearchDownloaderConfig):
+    pass
+
+
 @dataclass
 class OpenSearchDownloader(ElasticsearchDownloader):
     connection_config: OpenSearchConnectionConfig
+    download_config: OpensearchDownloaderConfig
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -125,9 +137,15 @@ class OpenSearchDownloader(ElasticsearchDownloader):
         return AsyncOpenSearch, async_scan


+@dataclass
+class OpensearchUploaderConfig(ElasticsearchUploaderConfig):
+    pass
+
+
 @dataclass
 class OpenSearchUploader(ElasticsearchUploader):
     connection_config: OpenSearchConnectionConfig
+    upload_config: OpensearchUploaderConfig
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -137,19 +155,29 @@ class OpenSearchUploader(ElasticsearchUploader):
         return parallel_bulk


+@dataclass
+class OpensearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
+    pass
+
+
+@dataclass
+class OpensearchUploadStager(ElasticsearchUploadStager):
+    upload_stager_config: OpensearchUploadStagerConfig
+
+
 opensearch_source_entry = SourceRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
     indexer=OpenSearchIndexer,
-    indexer_config=ElasticsearchIndexerConfig,
+    indexer_config=OpensearchIndexerConfig,
     downloader=OpenSearchDownloader,
-    downloader_config=ElasticsearchDownloaderConfig,
+    downloader_config=OpensearchDownloaderConfig,
 )


 opensearch_destination_entry = DestinationRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
-    upload_stager_config=ElasticsearchUploadStagerConfig,
-    upload_stager=ElasticsearchUploadStager,
-    uploader_config=ElasticsearchUploaderConfig,
+    upload_stager_config=OpensearchUploadStagerConfig,
+    upload_stager=OpensearchUploadStager,
+    uploader_config=OpensearchUploaderConfig,
     uploader=OpenSearchUploader,
 )
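The OpenSearch registry entries no longer reuse the Elasticsearch config classes directly; each stage now gets a thin OpenSearch-specific subclass. A minimal usage sketch of the new wiring (the `hosts` and `index_name` fields are assumed to be inherited from the Elasticsearch base configs and are not shown in this diff):

```python
# Hedged sketch: instantiating the new OpenSearch-specific configs.
# Field names (hosts, index_name) are assumptions inherited from the
# Elasticsearch base classes, not confirmed by this diff.
from unstructured_ingest.v2.processes.connectors.opensearch import (
    OpenSearchConnectionConfig,
    OpenSearchIndexer,
    OpensearchIndexerConfig,
)

indexer = OpenSearchIndexer(
    connection_config=OpenSearchConnectionConfig(hosts=["http://localhost:9200"]),
    index_config=OpensearchIndexerConfig(index_name="ingest-docs"),
)

# Indexer.run() is expected to yield one FileData record per document hit.
for file_data in indexer.run():
    print(file_data.identifier)
```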
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -123,9 +123,12 @@ class PineconeUploader(Uploader):
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE

-    @DestinationConnectionError.wrap
-    def check_connection(self):
-        _ = self.connection_config.get_index()
+    def precheck(self):
+        try:
+            self.connection_config.get_index()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batch(self, batch):
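The decorator-wrapped `check_connection` becomes an explicit `precheck`, the connection-validation pattern this release rolls out across connectors. A hedged sketch of a call site (illustrative, not taken from this diff):

```python
# Illustrative: run precheck() before processing anything so a bad
# destination fails fast with a typed error.
from unstructured_ingest.error import DestinationConnectionError


def ensure_destination_ready(uploader) -> None:
    try:
        uploader.precheck()  # e.g. PineconeUploader.precheck() probes get_index()
    except DestinationConnectionError as err:
        raise SystemExit(f"destination unreachable: {err}")
```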
unstructured_ingest/v2/processes/connectors/salesforce.py

@@ -18,10 +18,9 @@ from textwrap import dedent
 from typing import TYPE_CHECKING, Any, Generator, Type

 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata

 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -132,6 +132,13 @@ class SalesforceIndexer(Indexer):
         if record_type not in ACCEPTED_CATEGORIES:
             raise ValueError(f"{record_type} not currently an accepted Salesforce category")

+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def get_file_extension(self, record_type) -> str:
         if record_type == "EmailMessage":
             extension = ".eml"
@@ -172,7 +179,7 @@ class SalesforceIndexer(Indexer):
                     filename=record_with_extension,
                     fullpath=f"{record['attributes']['type']}/{record_with_extension}",
                 ),
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     url=record["attributes"]["url"],
                     version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                     date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
@@ -207,11 +214,6 @@ class SalesforceDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE

-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     def _xml_for_record(self, record: OrderedDict) -> str:
         """Creates partitionable xml file from a record"""
         import xml.etree.ElementTree as ET
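Connector-built metadata now comes from the package-local `FileDataSourceMetadata` rather than `DataSourceMetadata` imported from `unstructured`. A sketch of a `FileData` record built with the new class (the identifier, URL, and timestamp are placeholders; only the field names shown in the diff are taken from it):

```python
# Placeholder values; FileData/SourceIdentifiers/FileDataSourceMetadata field
# names mirror the diff above, the record id and URL are invented.
from unstructured_ingest.v2.interfaces import (
    FileData,
    FileDataSourceMetadata,
    SourceIdentifiers,
)

file_data = FileData(
    identifier="00X000000000001",
    connector_type="salesforce",
    source_identifiers=SourceIdentifiers(
        filename="00X000000000001.eml",
        fullpath="EmailMessage/00X000000000001.eml",
    ),
    metadata=FileDataSourceMetadata(
        url="/sobjects/EmailMessage/00X000000000001",
        date_created="1714000000.0",
    ),
)
```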
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -6,10 +6,8 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote

-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -18,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -134,6 +133,14 @@ class SharepointIndexer(Indexer):
     connection_config: SharepointConnectionConfig
     index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())

+    def precheck(self) -> None:
+        try:
+            site_client = self.connection_config.get_client()
+            site_client.site_pages.pages.get().execute_query()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
         if not recursive:
             folder.expand(["Files"]).get().execute_query()
@@ -187,7 +194,7 @@ class SharepointIndexer(Indexer):
                 fullpath=file_path,
                 rel_path=file_path.replace(self.index_config.path, ""),
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -222,7 +229,7 @@ class SharepointIndexer(Indexer):
                 fullpath=fullpath,
                 rel_path=rel_path,
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=absolute_url,
                 version=f"{file.major_version}.{file.minor_version}",
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -340,10 +347,9 @@ class SharepointDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE

     def get_download_path(self, file_data: FileData) -> Path:
+        download_path = super().get_download_path(file_data=file_data)
+
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        rel_path = file_data.source_identifiers.fullpath
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        download_path = self.download_dir / Path(rel_path)
         if content_type == SharepointContentType.SITEPAGE.value:
             # Update output extension to html if site page
             download_path = download_path.with_suffix(".html")
unstructured_ingest/v2/processes/connectors/sql.py

@@ -4,13 +4,14 @@ import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union

 import numpy as np
 import pandas as pd
 from dateutil import parser

 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -25,6 +26,11 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry

+if TYPE_CHECKING:
+    from sqlite3 import Connection as SqliteConnection
+
+    from psycopg2.extensions import connection as PostgresConnection
+
 CONNECTOR_TYPE = "sql"
 ELEMENTS_TABLE_NAME = "elements"

@@ -41,7 +47,7 @@ class DatabaseType(str, enum.Enum):


 @dataclass
-class SimpleSqlConfig(ConnectionConfig):
+class SQLConnectionConfig(ConnectionConfig):
     db_type: DatabaseType = (
         # required default value here because of parent class
         DatabaseType.SQLITE
@@ -134,7 +140,7 @@ class SQLUploadStager(UploadStager):
         **kwargs: Any,
     ) -> Path:
         with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
+            elements_contents: list[dict] = json.load(elements_file)
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
         output_path.parent.mkdir(parents=True, exist_ok=True)

@@ -151,7 +157,7 @@ class SQLUploadStager(UploadStager):
             data["id"] = str(uuid.uuid4())

             # remove extraneous, not supported columns
-            [data.pop(column) for column in data if column not in _COLUMNS]
+            data = {k: v for k, v in data.items() if k in _COLUMNS}

             output.append(data)

@@ -185,23 +191,32 @@ class SQLUploaderConfig(UploaderConfig):
 class SQLUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: SQLUploaderConfig
-    connection_config: SimpleSqlConfig
+    connection_config: SQLConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            cursor = self.connection().cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")

     @property
-    def connection(self):
+    def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
         if self.connection_config.db_type == DatabaseType.POSTGRESQL:
             return self._make_psycopg_connection
         elif self.connection_config.db_type == DatabaseType.SQLITE:
             return self._make_sqlite_connection
         raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")

-    def _make_sqlite_connection(self):
+    def _make_sqlite_connection(self) -> "SqliteConnection":
         from sqlite3 import connect

         return connect(database=self.connection_config.database)

     @requires_dependencies(["psycopg2"], extras="postgres")
-    def _make_psycopg_connection(self):
+    def _make_psycopg_connection(self) -> "PostgresConnection":
         from psycopg2 import connect

         return connect(
@@ -261,7 +276,7 @@ class SQLUploader(Uploader):


 sql_destination_entry = DestinationRegistryEntry(
-    connection_config=SimpleSqlConfig,
+    connection_config=SQLConnectionConfig,
     uploader=SQLUploader,
     uploader_config=SQLUploaderConfig,
     upload_stager=SQLUploadStager,
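`SQLUploader.connection` is now typed as a property that returns a connection factory, so callers invoke it as `self.connection()`; the new `precheck` above does exactly that. The same pattern from outside the class, as a hedged sketch (`database` comes from the sqlite branch in the diff; the uploader-config defaults are assumed):

```python
# Hedged sketch: the connection property returns a zero-argument factory.
# SQLUploaderConfig() with defaults is an assumption.
from unstructured_ingest.v2.processes.connectors.sql import (
    SQLConnectionConfig,
    SQLUploader,
    SQLUploaderConfig,
)

uploader = SQLUploader(
    upload_config=SQLUploaderConfig(),
    connection_config=SQLConnectionConfig(database="elements.db"),
)

conn = uploader.connection()  # sqlite3.Connection when db_type is SQLITE
cursor = conn.cursor()
cursor.execute("SELECT 1;")   # the same probe precheck() uses
print(cursor.fetchone())
cursor.close()
conn.close()
```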
unstructured_ingest/v2/processes/connectors/weaviate.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional
 from dateutil import parser

 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -156,15 +157,21 @@ class WeaviateUploaderConfig(UploaderConfig):
 class WeaviateUploader(Uploader):
     upload_config: WeaviateUploaderConfig
     connection_config: WeaviateConnectionConfig
-    client: Optional["Client"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["weaviate"], extras="weaviate")
-    def __post_init__(self):
+    def get_client(self) -> "Client":
         from weaviate import Client

         auth = self._resolve_auth_method()
-        self.client = Client(url=self.connection_config.host_url, auth_client_secret=auth)
+        return Client(url=self.connection_config.host_url, auth_client_secret=auth)
+
+    def precheck(self) -> None:
+        try:
+            self.get_client()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["weaviate"], extras="weaviate")
     def _resolve_auth_method(self):
@@ -215,8 +222,9 @@ class WeaviateUploader(Uploader):
             f"at {self.connection_config.host_url}",
         )

-        self.client.batch.configure(batch_size=self.upload_config.batch_size)
-        with self.client.batch as b:
+        client = self.get_client()
+        client.batch.configure(batch_size=self.upload_config.batch_size)
+        with client.batch as b:
             for e in elements_dict:
                 vector = e.pop("embeddings", None)
                 b.add_data_object(
unstructured_ingest/v2/processes/filter.py

@@ -0,0 +1,54 @@
+import fnmatch
+from abc import ABC
+from dataclasses import dataclass, field
+from typing import Any, Callable, Optional
+
+from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.process import BaseProcess
+from unstructured_ingest.v2.logger import logger
+
+
+@dataclass
+class FiltererConfig(EnhancedDataClassJsonMixin):
+    file_glob: Optional[list[str]] = None
+    max_file_size: Optional[int] = None
+
+
+@dataclass
+class Filterer(BaseProcess, ABC):
+    config: FiltererConfig = field(default_factory=lambda: FiltererConfig())
+    filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)
+
+    def __post_init__(self):
+        # Populate the filters based on values in config
+        if self.config.file_glob is not None:
+            self.filters.append(self.glob_filter)
+        if self.config.max_file_size:
+            self.filters.append(self.file_size_filter)
+
+    def is_async(self) -> bool:
+        return False
+
+    def file_size_filter(self, file_data: FileData) -> bool:
+        if filesize_bytes := file_data.metadata.filesize_bytes:
+            return filesize_bytes <= self.config.max_file_size
+        return True
+
+    def glob_filter(self, file_data: FileData) -> bool:
+        patterns = self.config.file_glob
+        path = file_data.source_identifiers.fullpath
+        for pattern in patterns:
+            if fnmatch.filter([path], pattern):
+                return True
+        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
+        return False
+
+    def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
+        for filter in self.filters:
+            if not filter(file_data):
+                logger.debug(
+                    f"filtered out file data due to {filter.__name__}: {file_data.identifier}"
+                )
+                return None
+        return file_data
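The new `Filterer` runs each configured predicate against a `FileData` record and returns `None` as soon as one rejects it, so downstream steps can simply drop filtered-out files. A minimal usage sketch:

```python
# Keep only markdown/PDF files no larger than 10 MB. The FileData records
# would come from an indexer elsewhere in the pipeline (not shown here).
from typing import Optional

from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig

filterer = Filterer(
    config=FiltererConfig(
        file_glob=["*.md", "*.pdf"],
        max_file_size=10 * 1024 * 1024,  # bytes
    )
)


def keep(file_data: FileData) -> Optional[FileData]:
    # Returns the record unchanged when every filter passes, otherwise None.
    return filterer.run(file_data=file_data)
```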
unstructured_ingest-0.0.3.dist-info/METADATA

@@ -0,0 +1,175 @@
+Metadata-Version: 2.1
+Name: unstructured-ingest
+Version: 0.0.3
+Summary: A library that prepares raw documents for downstream ML tasks.
+Home-page: https://github.com/Unstructured-IO/unstructured-ingest
+Author: Unstructured Technologies
+Author-email: devops@unstructuredai.io
+License: Apache-2.0
+Keywords: NLP PDF HTML CV XML parsing preprocessing
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9.0,<3.13
+Description-Content-Type: text/markdown
+Requires-Dist: unstructured
+Requires-Dist: python-dateutil
+Requires-Dist: pandas
+Provides-Extra: airtable
+Requires-Dist: pyairtable ; extra == 'airtable'
+Provides-Extra: astra
+Requires-Dist: astrapy ; extra == 'astra'
+Provides-Extra: azure
+Requires-Dist: fsspec ; extra == 'azure'
+Requires-Dist: adlfs ; extra == 'azure'
+Provides-Extra: azure-cognitive-search
+Requires-Dist: azure-search-documents ; extra == 'azure-cognitive-search'
+Provides-Extra: bedrock
+Requires-Dist: boto3 ; extra == 'bedrock'
+Requires-Dist: langchain-community ; extra == 'bedrock'
+Provides-Extra: biomed
+Requires-Dist: bs4 ; extra == 'biomed'
+Provides-Extra: box
+Requires-Dist: fsspec ; extra == 'box'
+Requires-Dist: boxfs ; extra == 'box'
+Provides-Extra: chroma
+Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
+Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
+Requires-Dist: chromadb ; extra == 'chroma'
+Provides-Extra: clarifai
+Requires-Dist: clarifai ; extra == 'clarifai'
+Provides-Extra: confluence
+Requires-Dist: atlassian-python-api ; extra == 'confluence'
+Provides-Extra: csv
+Requires-Dist: unstructured[tsv] ; extra == 'csv'
+Provides-Extra: databricks-volumes
+Requires-Dist: databricks-sdk ; extra == 'databricks-volumes'
+Provides-Extra: delta-table
+Requires-Dist: fsspec ; extra == 'delta-table'
+Requires-Dist: deltalake ; extra == 'delta-table'
+Provides-Extra: discord
+Requires-Dist: discord-py ; extra == 'discord'
+Provides-Extra: doc
+Requires-Dist: unstructured[docx] ; extra == 'doc'
+Provides-Extra: docx
+Requires-Dist: unstructured[docx] ; extra == 'docx'
+Provides-Extra: dropbox
+Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
+Requires-Dist: fsspec ; extra == 'dropbox'
+Provides-Extra: elasticsearch
+Requires-Dist: elasticsearch[async] ; extra == 'elasticsearch'
+Provides-Extra: embed-huggingface
+Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
+Requires-Dist: langchain-community ; extra == 'embed-huggingface'
+Requires-Dist: huggingface ; extra == 'embed-huggingface'
+Provides-Extra: embed-octoai
+Requires-Dist: tiktoken ; extra == 'embed-octoai'
+Requires-Dist: openai ; extra == 'embed-octoai'
+Provides-Extra: embed-vertexai
+Requires-Dist: langchain ; extra == 'embed-vertexai'
+Requires-Dist: langchain-community ; extra == 'embed-vertexai'
+Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
+Provides-Extra: embed-voyageai
+Requires-Dist: langchain ; extra == 'embed-voyageai'
+Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
+Provides-Extra: epub
+Requires-Dist: unstructured[epub] ; extra == 'epub'
+Provides-Extra: gcs
+Requires-Dist: fsspec ; extra == 'gcs'
+Requires-Dist: bs4 ; extra == 'gcs'
+Requires-Dist: gcsfs ; extra == 'gcs'
+Provides-Extra: github
+Requires-Dist: pygithub >1.58.0 ; extra == 'github'
+Provides-Extra: gitlab
+Requires-Dist: python-gitlab ; extra == 'gitlab'
+Provides-Extra: google-drive
+Requires-Dist: google-api-python-client ; extra == 'google-drive'
+Provides-Extra: hubspot
+Requires-Dist: urllib3 ; extra == 'hubspot'
+Requires-Dist: hubspot-api-client ; extra == 'hubspot'
+Provides-Extra: jira
+Requires-Dist: atlassian-python-api ; extra == 'jira'
+Provides-Extra: kafka
+Requires-Dist: confluent-kafka ; extra == 'kafka'
+Provides-Extra: md
+Requires-Dist: unstructured[md] ; extra == 'md'
+Provides-Extra: milvus
+Requires-Dist: pymilvus ; extra == 'milvus'
+Provides-Extra: mongodb
+Requires-Dist: pymongo ; extra == 'mongodb'
+Provides-Extra: msg
+Requires-Dist: unstructured[msg] ; extra == 'msg'
+Provides-Extra: notion
+Requires-Dist: notion-client ; extra == 'notion'
+Requires-Dist: htmlBuilder ; extra == 'notion'
+Provides-Extra: odt
+Requires-Dist: unstructured[odt] ; extra == 'odt'
+Provides-Extra: onedrive
+Requires-Dist: bs4 ; extra == 'onedrive'
+Requires-Dist: msal ; extra == 'onedrive'
+Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
+Provides-Extra: openai
+Requires-Dist: tiktoken ; extra == 'openai'
+Requires-Dist: langchain-community ; extra == 'openai'
+Requires-Dist: openai ; extra == 'openai'
+Provides-Extra: opensearch
+Requires-Dist: opensearch-py ; extra == 'opensearch'
+Provides-Extra: org
+Requires-Dist: unstructured[org] ; extra == 'org'
+Provides-Extra: outlook
+Requires-Dist: msal ; extra == 'outlook'
+Requires-Dist: Office365-REST-Python-Client ; extra == 'outlook'
+Provides-Extra: pdf
+Requires-Dist: unstructured[pdf] ; extra == 'pdf'
+Provides-Extra: pinecone
+Requires-Dist: pinecone-client >=3.7.1 ; extra == 'pinecone'
+Provides-Extra: postgres
+Requires-Dist: psycopg2-binary ; extra == 'postgres'
+Provides-Extra: ppt
+Requires-Dist: unstructured[pptx] ; extra == 'ppt'
+Provides-Extra: pptx
+Requires-Dist: unstructured[pptx] ; extra == 'pptx'
+Provides-Extra: qdrant
+Requires-Dist: qdrant-client ; extra == 'qdrant'
+Provides-Extra: reddit
+Requires-Dist: praw ; extra == 'reddit'
+Provides-Extra: rst
+Requires-Dist: unstructured[rst] ; extra == 'rst'
+Provides-Extra: rtf
+Requires-Dist: unstructured[rtf] ; extra == 'rtf'
+Provides-Extra: s3
+Requires-Dist: fsspec ; extra == 's3'
+Requires-Dist: s3fs ; extra == 's3'
+Provides-Extra: salesforce
+Requires-Dist: simple-salesforce ; extra == 'salesforce'
+Provides-Extra: sftp
+Requires-Dist: fsspec ; extra == 'sftp'
+Requires-Dist: paramiko ; extra == 'sftp'
+Provides-Extra: sharepoint
+Requires-Dist: msal ; extra == 'sharepoint'
+Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
+Provides-Extra: singlestore
+Requires-Dist: singlestoredb ; extra == 'singlestore'
+Provides-Extra: slack
+Requires-Dist: slack-sdk ; extra == 'slack'
+Provides-Extra: tsv
+Requires-Dist: unstructured[tsv] ; extra == 'tsv'
+Provides-Extra: weaviate
+Requires-Dist: weaviate-client ; extra == 'weaviate'
+Provides-Extra: wikipedia
+Requires-Dist: wikipedia ; extra == 'wikipedia'
+Provides-Extra: xlsx
+Requires-Dist: unstructured[xlsx] ; extra == 'xlsx'
+
+# Unstructured Ingest
+
+For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.