unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +8 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +8 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +30 -28
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +8 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +21 -5
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +8 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +33 -5
- unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- unstructured_ingest-0.0.3.dist-info/METADATA +175 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/RECORD +43 -40
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/opensearch.py

@@ -100,9 +100,15 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         return OpenSearch(**self.get_client_kwargs())


+@dataclass
+class OpensearchIndexerConfig(ElasticsearchIndexerConfig):
+    pass
+
+
 @dataclass
 class OpenSearchIndexer(ElasticsearchIndexer):
     connection_config: OpenSearchConnectionConfig
+    index_config: OpensearchIndexerConfig
     client: "OpenSearch" = field(init=False)

     @requires_dependencies(["opensearchpy"], extras="opensearch")

@@ -112,9 +118,15 @@ class OpenSearchIndexer(ElasticsearchIndexer):
         return scan


+@dataclass
+class OpensearchDownloaderConfig(ElasticsearchDownloaderConfig):
+    pass
+
+
 @dataclass
 class OpenSearchDownloader(ElasticsearchDownloader):
     connection_config: OpenSearchConnectionConfig
+    download_config: OpensearchDownloaderConfig
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["opensearchpy"], extras="opensearch")

@@ -125,9 +137,15 @@ class OpenSearchDownloader(ElasticsearchDownloader):
         return AsyncOpenSearch, async_scan


+@dataclass
+class OpensearchUploaderConfig(ElasticsearchUploaderConfig):
+    pass
+
+
 @dataclass
 class OpenSearchUploader(ElasticsearchUploader):
     connection_config: OpenSearchConnectionConfig
+    upload_config: OpensearchUploaderConfig
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["opensearchpy"], extras="opensearch")

@@ -137,19 +155,29 @@ class OpenSearchUploader(ElasticsearchUploader):
         return parallel_bulk


+@dataclass
+class OpensearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
+    pass
+
+
+@dataclass
+class OpensearchUploadStager(ElasticsearchUploadStager):
+    upload_stager_config: OpensearchUploadStagerConfig
+
+
 opensearch_source_entry = SourceRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
     indexer=OpenSearchIndexer,
-    indexer_config=
+    indexer_config=OpensearchIndexerConfig,
     downloader=OpenSearchDownloader,
-    downloader_config=
+    downloader_config=OpensearchDownloaderConfig,
 )


 opensearch_destination_entry = DestinationRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
-    upload_stager_config=
-    upload_stager=
-    uploader_config=
+    upload_stager_config=OpensearchUploadStagerConfig,
+    upload_stager=OpensearchUploadStager,
+    uploader_config=OpensearchUploaderConfig,
     uploader=OpenSearchUploader,
 )
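The OpenSearch changes above all follow one pattern: add empty `Opensearch*Config` dataclasses that subclass the Elasticsearch equivalents, pin them onto the connector classes, and reference them from the registry entries so each connector advertises its own config types. A minimal, self-contained sketch of that pattern (the `dict` connection config, the `index_name` field, and the default values are illustrative stand-ins, not the library's real fields):

```python
from dataclasses import dataclass, field


# Stand-ins for the shared Elasticsearch base classes.
@dataclass
class ElasticsearchIndexerConfig:
    index_name: str = "ingest-test"


@dataclass
class ElasticsearchIndexer:
    connection_config: dict
    index_config: ElasticsearchIndexerConfig = field(default_factory=ElasticsearchIndexerConfig)


# Connector-specific aliases, mirroring what the diff adds for OpenSearch:
# empty subclasses let the registry expose a distinct config type per
# connector while reusing the parent behaviour unchanged.
@dataclass
class OpensearchIndexerConfig(ElasticsearchIndexerConfig):
    pass


@dataclass
class OpenSearchIndexer(ElasticsearchIndexer):
    index_config: OpensearchIndexerConfig = field(default_factory=OpensearchIndexerConfig)


if __name__ == "__main__":
    indexer = OpenSearchIndexer(connection_config={"hosts": ["http://localhost:9200"]})
    print(type(indexer.index_config).__name__)  # OpensearchIndexerConfig
```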
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -123,9 +123,12 @@ class PineconeUploader(Uploader):
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE

-
-
-
+    def precheck(self):
+        try:
+            self.connection_config.get_index()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batch(self, batch):
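The new `precheck` gives the pipeline a cheap way to fail fast before any documents are processed: try to reach the destination once and re-raise any failure as a typed connection error. A self-contained sketch of that contract, with a stand-in exception class and a stand-in `get_index` callable (the real ones live in `unstructured_ingest.error` and on the connection config):

```python
import logging

logger = logging.getLogger(__name__)


class DestinationConnectionError(Exception):
    """Stand-in for unstructured_ingest.error.DestinationConnectionError."""


class PineconeUploaderSketch:
    """Illustrates the precheck contract: validate once, raise a typed error."""

    def __init__(self, get_index):
        # get_index stands in for self.connection_config.get_index()
        self._get_index = get_index

    def precheck(self) -> None:
        try:
            self._get_index()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e


def failing_get_index():
    raise RuntimeError("bad API key")


if __name__ == "__main__":
    try:
        PineconeUploaderSketch(get_index=failing_get_index).precheck()
    except DestinationConnectionError as err:
        print(err)
```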
unstructured_ingest/v2/processes/connectors/salesforce.py

@@ -18,10 +18,9 @@ from textwrap import dedent
 from typing import TYPE_CHECKING, Any, Generator, Type

 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata

 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -30,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,

@@ -132,6 +132,13 @@ class SalesforceIndexer(Indexer):
         if record_type not in ACCEPTED_CATEGORIES:
             raise ValueError(f"{record_type} not currently an accepted Salesforce category")

+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def get_file_extension(self, record_type) -> str:
         if record_type == "EmailMessage":
             extension = ".eml"

@@ -172,7 +179,7 @@ class SalesforceIndexer(Indexer):
                 filename=record_with_extension,
                 fullpath=f"{record['attributes']['type']}/{record_with_extension}",
             ),
-            metadata=
+            metadata=FileDataSourceMetadata(
                 url=record["attributes"]["url"],
                 version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                 date_created=str(parser.parse(record["CreatedDate"]).timestamp()),

@@ -207,11 +214,6 @@ class SalesforceDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE

-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     def _xml_for_record(self, record: OrderedDict) -> str:
         """Creates partitionable xml file from a record"""
         import xml.etree.ElementTree as ET
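Besides the `precheck` hook, the Salesforce indexer now builds its metadata with `FileDataSourceMetadata` from `unstructured_ingest.v2.interfaces` instead of the old `DataSourceMetadata` import from `unstructured`. The sketch below shows how the timestamp fields are derived with `dateutil`; the dataclass is a simplified stand-in for the real interface and the sample record values are invented:

```python
from dataclasses import dataclass
from typing import Optional

from dateutil import parser


# Simplified stand-in for unstructured_ingest.v2.interfaces.FileDataSourceMetadata;
# only the fields used in the hunk above are modelled.
@dataclass
class FileDataSourceMetadataSketch:
    url: Optional[str] = None
    version: Optional[str] = None
    date_created: Optional[str] = None


def metadata_from_salesforce_record(record: dict) -> FileDataSourceMetadataSketch:
    # Salesforce returns ISO-8601 timestamps; the connector stores them as
    # stringified epoch seconds, as the diff above shows.
    return FileDataSourceMetadataSketch(
        url=record["attributes"]["url"],
        version=str(parser.parse(record["SystemModstamp"]).timestamp()),
        date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
    )


if __name__ == "__main__":
    record = {
        "attributes": {"url": "/services/data/v57.0/sobjects/Account/001xx000003DGb2AAG"},
        "SystemModstamp": "2024-05-01T12:30:00.000+0000",
        "CreatedDate": "2024-04-01T09:00:00.000+0000",
    }
    print(metadata_from_salesforce_record(record))
```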
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -6,10 +6,8 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote

-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -18,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,

@@ -134,6 +133,14 @@ class SharepointIndexer(Indexer):
     connection_config: SharepointConnectionConfig
     index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())

+    def precheck(self) -> None:
+        try:
+            site_client = self.connection_config.get_client()
+            site_client.site_pages.pages.get().execute_query()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
         if not recursive:
             folder.expand(["Files"]).get().execute_query()

@@ -187,7 +194,7 @@ class SharepointIndexer(Indexer):
                 fullpath=file_path,
                 rel_path=file_path.replace(self.index_config.path, ""),
             ),
-            metadata=
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,

@@ -222,7 +229,7 @@ class SharepointIndexer(Indexer):
                 fullpath=fullpath,
                 rel_path=rel_path,
             ),
-            metadata=
+            metadata=FileDataSourceMetadata(
                 url=absolute_url,
                 version=f"{file.major_version}.{file.minor_version}",
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,

@@ -340,10 +347,9 @@ class SharepointDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE

     def get_download_path(self, file_data: FileData) -> Path:
+        download_path = super().get_download_path(file_data=file_data)
+
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        rel_path = file_data.source_identifiers.fullpath
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        download_path = self.download_dir / Path(rel_path)
         if content_type == SharepointContentType.SITEPAGE.value:
             # Update output extension to html if site page
             download_path = download_path.with_suffix(".html")
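The SharePoint downloader no longer rebuilds the download path by hand; it delegates to the base `Downloader.get_download_path` and only swaps the suffix for site pages (the Salesforce downloader drops its override entirely for the same reason). A simplified, self-contained sketch of that delegation is below; the plain `rel_path` / `is_site_page` arguments replace the real `FileData` object and are assumptions made to keep the example standalone:

```python
from pathlib import Path


class BaseDownloaderSketch:
    """Stand-in for the shared Downloader base; the path logic now lives here."""

    def __init__(self, download_dir: Path):
        self.download_dir = download_dir

    def get_download_path(self, rel_path: str) -> Path:
        # Normalisation that the connectors previously duplicated.
        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
        return self.download_dir / Path(rel_path)


class SharepointDownloaderSketch(BaseDownloaderSketch):
    def get_download_path(self, rel_path: str, is_site_page: bool = False) -> Path:
        # Delegate to the base class, then adjust only what is connector-specific.
        download_path = super().get_download_path(rel_path)
        if is_site_page:
            download_path = download_path.with_suffix(".html")
        return download_path


if __name__ == "__main__":
    downloader = SharepointDownloaderSketch(download_dir=Path("/tmp/ingest"))
    print(downloader.get_download_path("/sites/marketing/Home.aspx", is_site_page=True))
```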
unstructured_ingest/v2/processes/connectors/sql.py

@@ -4,13 +4,14 @@ import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union

 import numpy as np
 import pandas as pd
 from dateutil import parser

 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -25,6 +26,11 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry

+if TYPE_CHECKING:
+    from sqlite3 import Connection as SqliteConnection
+
+    from psycopg2.extensions import connection as PostgresConnection
+
 CONNECTOR_TYPE = "sql"
 ELEMENTS_TABLE_NAME = "elements"


@@ -41,7 +47,7 @@ class DatabaseType(str, enum.Enum):


 @dataclass
-class
+class SQLConnectionConfig(ConnectionConfig):
     db_type: DatabaseType = (
         # required default value here because of parent class
         DatabaseType.SQLITE

@@ -134,7 +140,7 @@ class SQLUploadStager(UploadStager):
         **kwargs: Any,
     ) -> Path:
         with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
+            elements_contents: list[dict] = json.load(elements_file)
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
         output_path.parent.mkdir(parents=True, exist_ok=True)


@@ -151,7 +157,7 @@ class SQLUploadStager(UploadStager):
             data["id"] = str(uuid.uuid4())

             # remove extraneous, not supported columns
-
+            data = {k: v for k, v in data.items() if k in _COLUMNS}

             output.append(data)


@@ -185,23 +191,32 @@ class SQLUploaderConfig(UploaderConfig):
 class SQLUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: SQLUploaderConfig
-    connection_config:
+    connection_config: SQLConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            cursor = self.connection().cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")

     @property
-    def connection(self):
+    def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
         if self.connection_config.db_type == DatabaseType.POSTGRESQL:
             return self._make_psycopg_connection
         elif self.connection_config.db_type == DatabaseType.SQLITE:
             return self._make_sqlite_connection
         raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")

-    def _make_sqlite_connection(self):
+    def _make_sqlite_connection(self) -> "SqliteConnection":
         from sqlite3 import connect

         return connect(database=self.connection_config.database)

     @requires_dependencies(["psycopg2"], extras="postgres")
-    def _make_psycopg_connection(self):
+    def _make_psycopg_connection(self) -> "PostgresConnection":
         from psycopg2 import connect

         return connect(

@@ -261,7 +276,7 @@ class SQLUploader(Uploader):


 sql_destination_entry = DestinationRegistryEntry(
-    connection_config=
+    connection_config=SQLConnectionConfig,
     uploader=SQLUploader,
     uploader_config=SQLUploaderConfig,
     upload_stager=SQLUploadStager,
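Two details in the SQL hunks are worth calling out: `connection` is a property that returns a connection factory rather than an open connection (so `precheck` has to call `self.connection()`), and the precheck itself is just a `SELECT 1;` round trip. The sketch below reproduces both using only the standard library's sqlite3; the psycopg2 branch is omitted, and the class is a stand-in rather than the package's real `SQLUploader`:

```python
import enum
import logging
import sqlite3
from typing import Callable

logger = logging.getLogger(__name__)


class DatabaseType(str, enum.Enum):
    SQLITE = "sqlite"
    POSTGRESQL = "postgresql"


class SQLUploaderSketch:
    """Connection-factory dispatch plus the SELECT 1 precheck shown in the diff."""

    def __init__(self, db_type: DatabaseType, database: str):
        self.db_type = db_type
        self.database = database

    @property
    def connection(self) -> Callable[[], sqlite3.Connection]:
        # The property returns a factory, not an open connection; callers invoke it.
        if self.db_type == DatabaseType.SQLITE:
            return self._make_sqlite_connection
        raise ValueError(f"Unsupported database {self.db_type} connection.")

    def _make_sqlite_connection(self) -> sqlite3.Connection:
        return sqlite3.connect(database=self.database)

    def precheck(self) -> None:
        try:
            cursor = self.connection().cursor()
            cursor.execute("SELECT 1;")
            cursor.close()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise


if __name__ == "__main__":
    SQLUploaderSketch(DatabaseType.SQLITE, ":memory:").precheck()
    print("sqlite connection validated")
```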
unstructured_ingest/v2/processes/connectors/weaviate.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional
 from dateutil import parser

 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -156,15 +157,21 @@ class WeaviateUploaderConfig(UploaderConfig):
 class WeaviateUploader(Uploader):
     upload_config: WeaviateUploaderConfig
     connection_config: WeaviateConnectionConfig
-    client: Optional["Client"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["weaviate"], extras="weaviate")
-    def
+    def get_client(self) -> "Client":
         from weaviate import Client

         auth = self._resolve_auth_method()
-
+        return Client(url=self.connection_config.host_url, auth_client_secret=auth)
+
+    def precheck(self) -> None:
+        try:
+            self.get_client()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["weaviate"], extras="weaviate")
     def _resolve_auth_method(self):

@@ -215,8 +222,9 @@ class WeaviateUploader(Uploader):
             f"at {self.connection_config.host_url}",
         )

-
-
+        client = self.get_client()
+        client.batch.configure(batch_size=self.upload_config.batch_size)
+        with client.batch as b:
             for e in elements_dict:
                 vector = e.pop("embeddings", None)
                 b.add_data_object(
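The Weaviate uploader stops caching a `client` field and instead builds the client on demand via `get_client()`, then configures and drains a batch inside a context manager. A runnable example would need a live Weaviate instance, so the sketch below models only that control flow with fake client and batch classes; everything named `Fake*` is an assumption made for the sketch, not part of `weaviate-client`:

```python
class FakeBatch:
    """Stand-in for weaviate.Client.batch: a context manager that flushes on exit."""

    def __init__(self):
        self.batch_size = None
        self.objects = []

    def configure(self, batch_size: int) -> None:
        self.batch_size = batch_size

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        print(f"flushed {len(self.objects)} objects in batches of {self.batch_size}")

    def add_data_object(self, data_object: dict, vector=None) -> None:
        self.objects.append((data_object, vector))


class FakeClient:
    """Stand-in for weaviate.Client; only the attributes used above are modelled."""

    def __init__(self, url: str):
        self.url = url
        self.batch = FakeBatch()


class WeaviateUploaderSketch:
    def __init__(self, host_url: str, batch_size: int):
        self.host_url = host_url
        self.batch_size = batch_size

    def get_client(self) -> FakeClient:
        # The diff drops the cached `client` field; each call builds a fresh client.
        return FakeClient(url=self.host_url)

    def run(self, elements: list) -> None:
        client = self.get_client()
        client.batch.configure(batch_size=self.batch_size)
        with client.batch as b:
            for e in elements:
                vector = e.pop("embeddings", None)
                b.add_data_object(data_object=e, vector=vector)


if __name__ == "__main__":
    WeaviateUploaderSketch("http://localhost:8080", batch_size=10).run(
        [{"text": "hello", "embeddings": [0.1, 0.2]}, {"text": "world"}]
    )
```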
unstructured_ingest/v2/processes/filter.py (new file)

@@ -0,0 +1,54 @@
+import fnmatch
+from abc import ABC
+from dataclasses import dataclass, field
+from typing import Any, Callable, Optional
+
+from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.process import BaseProcess
+from unstructured_ingest.v2.logger import logger
+
+
+@dataclass
+class FiltererConfig(EnhancedDataClassJsonMixin):
+    file_glob: Optional[list[str]] = None
+    max_file_size: Optional[int] = None
+
+
+@dataclass
+class Filterer(BaseProcess, ABC):
+    config: FiltererConfig = field(default_factory=lambda: FiltererConfig())
+    filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)
+
+    def __post_init__(self):
+        # Populate the filters based on values in config
+        if self.config.file_glob is not None:
+            self.filters.append(self.glob_filter)
+        if self.config.max_file_size:
+            self.filters.append(self.file_size_filter)
+
+    def is_async(self) -> bool:
+        return False
+
+    def file_size_filter(self, file_data: FileData) -> bool:
+        if filesize_bytes := file_data.metadata.filesize_bytes:
+            return filesize_bytes <= self.config.max_file_size
+        return True
+
+    def glob_filter(self, file_data: FileData) -> bool:
+        patterns = self.config.file_glob
+        path = file_data.source_identifiers.fullpath
+        for pattern in patterns:
+            if fnmatch.filter([path], pattern):
+                return True
+        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
+        return False
+
+    def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
+        for filter in self.filters:
+            if not filter(file_data):
+                logger.debug(
+                    f"filtered out file data due to {filter.__name__}: {file_data.identifier}"
+                )
+                return None
+        return file_data
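The new `Filterer` applies every configured predicate to each `FileData` and drops the record on the first failure; note that the `fnmatch` globs match against the full path and that a missing `filesize_bytes` never filters a file out. The standalone helper below restates that logic so it can be run without the package installed; `passes_filters` is a hypothetical function for illustration, not part of `unstructured-ingest`:

```python
import fnmatch
from typing import Optional


def passes_filters(
    fullpath: str,
    filesize_bytes: Optional[int],
    file_glob: Optional[list] = None,
    max_file_size: Optional[int] = None,
) -> bool:
    """Keep a file only if it matches some glob and is not larger than
    max_file_size; missing metadata never filters a file out."""
    if file_glob is not None and not any(
        fnmatch.filter([fullpath], pattern) for pattern in file_glob
    ):
        return False
    if max_file_size and filesize_bytes and filesize_bytes > max_file_size:
        return False
    return True


if __name__ == "__main__":
    print(passes_filters("docs/report.pdf", 10_000, file_glob=["*.pdf"], max_file_size=1_000_000))  # True
    print(passes_filters("docs/report.pdf", None, file_glob=["*.docx"]))  # False: no glob matches
    print(passes_filters("docs/report.pdf", 5_000_000, max_file_size=1_000_000))  # False: too large
```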
unstructured_ingest-0.0.3.dist-info/METADATA (new file)

@@ -0,0 +1,175 @@
+Metadata-Version: 2.1
+Name: unstructured-ingest
+Version: 0.0.3
+Summary: A library that prepares raw documents for downstream ML tasks.
+Home-page: https://github.com/Unstructured-IO/unstructured-ingest
+Author: Unstructured Technologies
+Author-email: devops@unstructuredai.io
+License: Apache-2.0
+Keywords: NLP PDF HTML CV XML parsing preprocessing
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9.0,<3.13
+Description-Content-Type: text/markdown
+Requires-Dist: unstructured
+Requires-Dist: python-dateutil
+Requires-Dist: pandas
+Provides-Extra: airtable
+Requires-Dist: pyairtable ; extra == 'airtable'
+Provides-Extra: astra
+Requires-Dist: astrapy ; extra == 'astra'
+Provides-Extra: azure
+Requires-Dist: fsspec ; extra == 'azure'
+Requires-Dist: adlfs ; extra == 'azure'
+Provides-Extra: azure-cognitive-search
+Requires-Dist: azure-search-documents ; extra == 'azure-cognitive-search'
+Provides-Extra: bedrock
+Requires-Dist: boto3 ; extra == 'bedrock'
+Requires-Dist: langchain-community ; extra == 'bedrock'
+Provides-Extra: biomed
+Requires-Dist: bs4 ; extra == 'biomed'
+Provides-Extra: box
+Requires-Dist: fsspec ; extra == 'box'
+Requires-Dist: boxfs ; extra == 'box'
+Provides-Extra: chroma
+Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
+Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
+Requires-Dist: chromadb ; extra == 'chroma'
+Provides-Extra: clarifai
+Requires-Dist: clarifai ; extra == 'clarifai'
+Provides-Extra: confluence
+Requires-Dist: atlassian-python-api ; extra == 'confluence'
+Provides-Extra: csv
+Requires-Dist: unstructured[tsv] ; extra == 'csv'
+Provides-Extra: databricks-volumes
+Requires-Dist: databricks-sdk ; extra == 'databricks-volumes'
+Provides-Extra: delta-table
+Requires-Dist: fsspec ; extra == 'delta-table'
+Requires-Dist: deltalake ; extra == 'delta-table'
+Provides-Extra: discord
+Requires-Dist: discord-py ; extra == 'discord'
+Provides-Extra: doc
+Requires-Dist: unstructured[docx] ; extra == 'doc'
+Provides-Extra: docx
+Requires-Dist: unstructured[docx] ; extra == 'docx'
+Provides-Extra: dropbox
+Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
+Requires-Dist: fsspec ; extra == 'dropbox'
+Provides-Extra: elasticsearch
+Requires-Dist: elasticsearch[async] ; extra == 'elasticsearch'
+Provides-Extra: embed-huggingface
+Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
+Requires-Dist: langchain-community ; extra == 'embed-huggingface'
+Requires-Dist: huggingface ; extra == 'embed-huggingface'
+Provides-Extra: embed-octoai
+Requires-Dist: tiktoken ; extra == 'embed-octoai'
+Requires-Dist: openai ; extra == 'embed-octoai'
+Provides-Extra: embed-vertexai
+Requires-Dist: langchain ; extra == 'embed-vertexai'
+Requires-Dist: langchain-community ; extra == 'embed-vertexai'
+Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
+Provides-Extra: embed-voyageai
+Requires-Dist: langchain ; extra == 'embed-voyageai'
+Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
+Provides-Extra: epub
+Requires-Dist: unstructured[epub] ; extra == 'epub'
+Provides-Extra: gcs
+Requires-Dist: fsspec ; extra == 'gcs'
+Requires-Dist: bs4 ; extra == 'gcs'
+Requires-Dist: gcsfs ; extra == 'gcs'
+Provides-Extra: github
+Requires-Dist: pygithub >1.58.0 ; extra == 'github'
+Provides-Extra: gitlab
+Requires-Dist: python-gitlab ; extra == 'gitlab'
+Provides-Extra: google-drive
+Requires-Dist: google-api-python-client ; extra == 'google-drive'
+Provides-Extra: hubspot
+Requires-Dist: urllib3 ; extra == 'hubspot'
+Requires-Dist: hubspot-api-client ; extra == 'hubspot'
+Provides-Extra: jira
+Requires-Dist: atlassian-python-api ; extra == 'jira'
+Provides-Extra: kafka
+Requires-Dist: confluent-kafka ; extra == 'kafka'
+Provides-Extra: md
+Requires-Dist: unstructured[md] ; extra == 'md'
+Provides-Extra: milvus
+Requires-Dist: pymilvus ; extra == 'milvus'
+Provides-Extra: mongodb
+Requires-Dist: pymongo ; extra == 'mongodb'
+Provides-Extra: msg
+Requires-Dist: unstructured[msg] ; extra == 'msg'
+Provides-Extra: notion
+Requires-Dist: notion-client ; extra == 'notion'
+Requires-Dist: htmlBuilder ; extra == 'notion'
+Provides-Extra: odt
+Requires-Dist: unstructured[odt] ; extra == 'odt'
+Provides-Extra: onedrive
+Requires-Dist: bs4 ; extra == 'onedrive'
+Requires-Dist: msal ; extra == 'onedrive'
+Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
+Provides-Extra: openai
+Requires-Dist: tiktoken ; extra == 'openai'
+Requires-Dist: langchain-community ; extra == 'openai'
+Requires-Dist: openai ; extra == 'openai'
+Provides-Extra: opensearch
+Requires-Dist: opensearch-py ; extra == 'opensearch'
+Provides-Extra: org
+Requires-Dist: unstructured[org] ; extra == 'org'
+Provides-Extra: outlook
+Requires-Dist: msal ; extra == 'outlook'
+Requires-Dist: Office365-REST-Python-Client ; extra == 'outlook'
+Provides-Extra: pdf
+Requires-Dist: unstructured[pdf] ; extra == 'pdf'
+Provides-Extra: pinecone
+Requires-Dist: pinecone-client >=3.7.1 ; extra == 'pinecone'
+Provides-Extra: postgres
+Requires-Dist: psycopg2-binary ; extra == 'postgres'
+Provides-Extra: ppt
+Requires-Dist: unstructured[pptx] ; extra == 'ppt'
+Provides-Extra: pptx
+Requires-Dist: unstructured[pptx] ; extra == 'pptx'
+Provides-Extra: qdrant
+Requires-Dist: qdrant-client ; extra == 'qdrant'
+Provides-Extra: reddit
+Requires-Dist: praw ; extra == 'reddit'
+Provides-Extra: rst
+Requires-Dist: unstructured[rst] ; extra == 'rst'
+Provides-Extra: rtf
+Requires-Dist: unstructured[rtf] ; extra == 'rtf'
+Provides-Extra: s3
+Requires-Dist: fsspec ; extra == 's3'
+Requires-Dist: s3fs ; extra == 's3'
+Provides-Extra: salesforce
+Requires-Dist: simple-salesforce ; extra == 'salesforce'
+Provides-Extra: sftp
+Requires-Dist: fsspec ; extra == 'sftp'
+Requires-Dist: paramiko ; extra == 'sftp'
+Provides-Extra: sharepoint
+Requires-Dist: msal ; extra == 'sharepoint'
+Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
+Provides-Extra: singlestore
+Requires-Dist: singlestoredb ; extra == 'singlestore'
+Provides-Extra: slack
+Requires-Dist: slack-sdk ; extra == 'slack'
+Provides-Extra: tsv
+Requires-Dist: unstructured[tsv] ; extra == 'tsv'
+Provides-Extra: weaviate
+Requires-Dist: weaviate-client ; extra == 'weaviate'
+Provides-Extra: wikipedia
+Requires-Dist: wikipedia ; extra == 'wikipedia'
+Provides-Extra: xlsx
+Requires-Dist: unstructured[xlsx] ; extra == 'xlsx'
+
+# Unstructured Ingest
+
+For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.