unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic by the registry.
- test/integration/connectors/sql/test_postgres.py +3 -3
- test/integration/connectors/sql/test_singlestore.py +3 -3
- test/integration/connectors/sql/test_sqlite.py +3 -3
- test/integration/connectors/test_astradb.py +40 -0
- test/integration/connectors/test_kafka.py +2 -2
- test/integration/connectors/test_mongodb.py +4 -1
- test/integration/connectors/utils/validation/source.py +31 -11
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +35 -33
- unstructured_ingest/v2/processes/connectors/couchbase.py +50 -41
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/mongodb.py +95 -100
- unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +31 -26
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +14 -13
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +44 -44
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/astradb.py

@@ -1,13 +1,11 @@
-import copy
 import csv
 import hashlib
-import sys
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional

-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret

 from unstructured_ingest import __name__ as integration_name
 from unstructured_ingest.__version__ import __version__ as integration_version
@@ -22,6 +20,8 @@ from unstructured_ingest.utils.string_and_date_utils import truncate_string_byte
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -53,6 +53,15 @@ CONNECTOR_TYPE = "astradb"
 MAX_CONTENT_PARAM_BYTE_SIZE = 8000


+class AstraDBAdditionalMetadata(BaseModel):
+    collection_name: str
+    keyspace: Optional[str] = None
+
+
+class AstraDBBatchFileData(BatchFileData):
+    additional_metadata: AstraDBAdditionalMetadata
+
+
 class AstraDBAccessConfig(AccessConfig):
     token: str = Field(description="Astra DB Token with access to the database.")
     api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
@@ -179,9 +188,6 @@ class AstraDBIndexer(Indexer):

     def _get_doc_ids(self) -> set[str]:
         """Fetches all document ids in an index"""
-        # Initialize set of ids
-        ids = set()
-
         # Get the collection
         collection = self.get_collection()

@@ -194,31 +200,26 @@ class AstraDBIndexer(Indexer):
             astra_db_docs.append(result)

         # Create file data for each astra record
-        for astra_record in astra_db_docs
-            ids.add(astra_record["_id"])
+        ids = sorted([astra_record["_id"] for astra_record in astra_db_docs])

-        return ids
+        return set(ids)

-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         id_batches = batch_generator(ids, self.index_config.batch_size)

         for batch in id_batches:
-            identified = str(hash(batch) + sys.maxsize + 1)
-            fd = FileData(
-                identifier=identified,
+            fd = AstraDBBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     date_processed=str(time()),
                 ),
-                additional_metadata=
+                additional_metadata=AstraDBAdditionalMetadata(
+                    collection_name=self.index_config.collection_name,
+                    keyspace=self.index_config.keyspace,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
             yield fd

@@ -247,7 +248,9 @@ class AstraDBDownloader(Downloader):
         writer.writerow(astra_result.keys())
         writer.writerow(astra_result.values())

-    def generate_download_response(
+    def generate_download_response(
+        self, result: dict, file_data: AstraDBBatchFileData
+    ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(record_id=record_id)
         filename = f"{filename_id}.csv"  # csv to preserve column info
@@ -255,7 +258,7 @@ class AstraDBDownloader(Downloader):
         logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            self.write_astra_result_to_csv(astra_result=result, download_path=download_path)
+            self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
         except Exception as e:
             logger.error(
                 f"failed to download from record {record_id} to {download_path}: {e}",
@@ -264,14 +267,12 @@ class AstraDBDownloader(Downloader):
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")

         # modify input file_data for download_response
-        copied_file_data.metadata.record_locator = {"document_id": record_id}
-        copied_file_data.additional_metadata.pop("ids", None)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.record_locator = {"document_id": record_id}
         return super().generate_download_response(
-            file_data=
+            file_data=cast_file_data, download_path=download_path
         )

     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -279,9 +280,10 @@ class AstraDBDownloader(Downloader):

     async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
         # Get metadata from file_data
+        astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
+        ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
+        collection_name: str = astra_file_data.additional_metadata.collection_name
+        keyspace: str = astra_file_data.additional_metadata.keyspace

         # Retrieve results from async collection
         download_responses = []
@@ -292,7 +294,7 @@ class AstraDBDownloader(Downloader):
         )
         async for result in async_astra_collection.find({"_id": {"$in": ids}}):
             download_responses.append(
-                self.generate_download_response(result=result, file_data=
+                self.generate_download_response(result=result, file_data=astra_file_data)
             )
         return download_responses
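Across the connectors touched by this release (Astra DB above, Couchbase and Elasticsearch below), the hand-built FileData with doc_type="batch" and a free-form additional_metadata dict is replaced by a typed batch model. A minimal sketch of that pattern, using only names visible in this diff; MyAdditionalMetadata and MyBatchFileData are hypothetical stand-ins, not classes shipped by the package:

# Sketch only: mirrors what AstraDBIndexer.run() now yields per batch of record ids.
from typing import Optional

from pydantic import BaseModel

from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem, FileDataSourceMetadata


class MyAdditionalMetadata(BaseModel):
    collection_name: str
    keyspace: Optional[str] = None


class MyBatchFileData(BatchFileData):
    additional_metadata: MyAdditionalMetadata


fd = MyBatchFileData(
    connector_type="astradb",
    metadata=FileDataSourceMetadata(date_processed="1700000000.0"),
    additional_metadata=MyAdditionalMetadata(collection_name="my_collection"),
    batch_items=[BatchItem(identifier=record_id) for record_id in ("id-1", "id-2")],
)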
unstructured_ingest/v2/processes/connectors/couchbase.py

@@ -1,5 +1,4 @@
 import hashlib
-import sys
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -7,7 +6,7 @@ from datetime import timedelta
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, List

-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret

 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -18,6 +17,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -40,11 +41,20 @@ from unstructured_ingest.v2.processes.connector_registry import (

 if TYPE_CHECKING:
     from couchbase.cluster import Cluster
+    from couchbase.collection import Collection

 CONNECTOR_TYPE = "couchbase"
 SERVER_API_VERSION = "1"


+class CouchbaseAdditionalMetadata(BaseModel):
+    bucket: str
+
+
+class CouchbaseBatchFileData(BatchFileData):
+    additional_metadata: CouchbaseAdditionalMetadata
+
+
 class CouchbaseAccessConfig(AccessConfig):
     password: str = Field(description="The password for the Couchbase server")

@@ -180,31 +190,21 @@ class CouchbaseIndexer(Indexer):
             if attempts == max_attempts:
                 raise SourceConnectionError(f"failed to get document ids: {e}")

-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[CouchbaseBatchFileData, None, None]:
         ids = self._get_doc_ids()
-        id_batches = [
-            ids[i * self.index_config.batch_size : (i + 1) * self.index_config.batch_size]
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-            yield FileData(
-                identifier=identified,
+            yield CouchbaseBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",
                     date_processed=str(time.time()),
                 ),
-                additional_metadata=
+                additional_metadata=CouchbaseAdditionalMetadata(
+                    bucket=self.connection_config.bucket
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )


@@ -241,7 +241,7 @@ class CouchbaseDownloader(Downloader):
         return concatenated_values

     def generate_download_response(
-        self, result: dict, bucket: str, file_data:
+        self, result: dict, bucket: str, file_data: CouchbaseBatchFileData
     ) -> DownloadResponse:
         record_id = result[self.download_config.collection_id]
         filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
@@ -261,28 +261,25 @@ class CouchbaseDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-                },
-            ),
-            ),
-            path=download_path,
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time.time())
+        cast_file_data.metadata.record_locator = {
+            "connection_string": self.connection_config.connection_string,
+            "bucket": bucket,
+            "scope": self.connection_config.scope,
+            "collection": self.connection_config.collection,
+            "document_id": record_id,
+        }
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )

     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        couchbase_file_data = CouchbaseBatchFileData.cast(file_data=file_data)
+        bucket_name: str = couchbase_file_data.additional_metadata.bucket
+        ids: list[str] = [item.identifier for item in couchbase_file_data.batch_items]

         with self.connection_config.get_client() as client:
             bucket = client.bucket(bucket_name)
@@ -292,13 +289,25 @@ class CouchbaseDownloader(Downloader):
             download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
             return list(download_resp)

-    def process_doc_id(
+    def process_doc_id(
+        self,
+        doc_id: str,
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
         result = collection.get(doc_id)
         return self.generate_download_response(
             result=result.content_as[dict], bucket=bucket_name, file_data=file_data
         )

-    def process_all_doc_ids(
+    def process_all_doc_ids(
+        self,
+        ids: list[str],
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
         for doc_id in ids:
             yield self.process_doc_id(doc_id, collection, bucket_name, file_data)
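On the download side, the Astra DB, Couchbase, and Elasticsearch downloaders stop rebuilding a copied FileData by hand and instead re-validate the incoming object with FileData.cast before attaching the per-record identifier and record_locator. A rough sketch of that step; the helper below is hypothetical and assumes FileData is exported from unstructured_ingest.v2.interfaces as it is elsewhere in the v2 connectors:

# Hypothetical helper condensing the cast-and-annotate step the downloaders now share.
from unstructured_ingest.v2.interfaces import FileData


def as_single_record(file_data: FileData, identifier: str, record_locator: dict) -> FileData:
    cast_file_data = FileData.cast(file_data=file_data)  # re-validate into the base model
    cast_file_data.identifier = identifier
    cast_file_data.metadata.record_locator = record_locator
    return cast_file_data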
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py

@@ -1,6 +1,5 @@
 import collections
 import hashlib
-import sys
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -15,11 +14,17 @@ from unstructured_ingest.error import (
     SourceConnectionNetworkError,
     WriteError,
 )
-from unstructured_ingest.utils.data_prep import
+from unstructured_ingest.utils.data_prep import (
+    batch_generator,
+    flatten_dict,
+    generator_batching_wbytes,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -48,6 +53,14 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "elasticsearch"


+class ElastisearchAdditionalMetadata(BaseModel):
+    index_name: str
+
+
+class ElasticsearchBatchFileData(BatchFileData):
+    additional_metadata: ElastisearchAdditionalMetadata
+
+
 class ElasticsearchAccessConfig(AccessConfig):
     password: Optional[str] = Field(
         default=None, description="password when using basic auth or connecting to a cloud instance"
@@ -174,36 +187,21 @@ class ElasticsearchIndexer(Indexer):

         return {hit["_id"] for hit in hits}

-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[ElasticsearchBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
-            frozenset(
-                ids[
-                    i
-                    * self.index_config.batch_size : (i + 1)  # noqa
-                    * self.index_config.batch_size
-                ]
-            )
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-            yield FileData(
-                identifier=identified,
+            yield ElasticsearchBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
-                additional_metadata=
+                additional_metadata=ElastisearchAdditionalMetadata(
+                    index_name=self.index_config.index_name,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )


@@ -237,7 +235,7 @@ class ElasticsearchDownloader(Downloader):
         return concatenated_values

     def generate_download_response(
-        self, result: dict, index_name: str, file_data:
+        self, result: dict, index_name: str, file_data: ElasticsearchBatchFileData
     ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
@@ -257,22 +255,19 @@ class ElasticsearchDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-                ),
-            ),
-            path=download_path,
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.version = str(result["_version"]) if "_version" in result else None
+        cast_file_data.metadata.record_locator = {
+            "hosts": self.connection_config.hosts,
+            "index_name": index_name,
+            "document_id": record_id,
+        }
+        cast_file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )

     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -285,11 +280,12 @@ class ElasticsearchDownloader(Downloader):

         return AsyncElasticsearch, async_scan

-    async def run_async(self, file_data:
+    async def run_async(self, file_data: BatchFileData, **kwargs: Any) -> download_responses:
+        elasticsearch_filedata = ElasticsearchBatchFileData.cast(file_data=file_data)
         AsyncClient, async_scan = self.load_async()

-        index_name: str =
-        ids: list[str] =
+        index_name: str = elasticsearch_filedata.additional_metadata.index_name
+        ids: list[str] = [item.identifier for item in elasticsearch_filedata.batch_items]

         scan_query = {
             "_source": self.download_config.fields,
@@ -307,7 +303,7 @@ class ElasticsearchDownloader(Downloader):
         ):
             download_responses.append(
                 self.generate_download_response(
-                    result=result, index_name=index_name, file_data=
+                    result=result, index_name=index_name, file_data=elasticsearch_filedata
                 )
             )
         return download_responses
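The Couchbase and Elasticsearch indexers also drop their hand-rolled slicing in favor of the shared batch_generator helper already used by the Astra DB indexer. A small usage sketch, with the helper called positionally as in the connectors; the exact type of each yielded batch is the helper's choice:

from unstructured_ingest.utils.data_prep import batch_generator

doc_ids = [f"doc-{i}" for i in range(7)]
for batch in batch_generator(doc_ids, 3):
    print(list(batch))  # expected grouping: 3, 3, then 1 ids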
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -1,14 +1,14 @@
 from __future__ import annotations

+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -25,6 +25,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict

+if TYPE_CHECKING:
+    from adlfs import AzureBlobFileSystem
+
 CONNECTOR_TYPE = "azure"


@@ -89,6 +92,12 @@ class AzureConnectionConfig(FsspecConnectionConfig):
         }
         return access_configs

+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+

 @dataclass
 class AzureIndexer(FsspecIndexer):
@@ -96,17 +105,9 @@ class AzureIndexer(FsspecIndexer):
     index_config: AzureIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)

-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = (
@@ -149,14 +150,6 @@ class AzureDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig)

-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-

 class AzureUploaderConfig(FsspecUploaderConfig):
     pass
@@ -168,22 +161,6 @@ class AzureUploader(FsspecUploader):
     connection_config: AzureConnectionConfig
     upload_config: AzureUploaderConfig = field(default=None)

-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-

 azure_source_entry = SourceRegistryEntry(
     indexer=AzureIndexer,
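The fsspec-based connectors (Azure above, Box below; the file list shows similar changes to Dropbox, GCS, S3, and SFTP) consolidate their dependency checks: instead of re-applying requires_dependencies to every run()/run_async()/precheck() override, the connection config exposes a single decorated get_client() context manager. A sketch of that pattern as it appears in the Azure diff; MyConnectionConfig is an illustrative stand-in, not a class from the package:

from contextlib import contextmanager
from typing import TYPE_CHECKING, Generator

from unstructured_ingest.utils.dep_check import requires_dependencies
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import FsspecConnectionConfig

if TYPE_CHECKING:
    from adlfs import AzureBlobFileSystem


class MyConnectionConfig(FsspecConnectionConfig):
    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
        # The dependency check happens once here, so the thin run()/precheck()
        # wrappers that existed only to re-apply the decorator could be removed.
        with super().get_client(protocol=protocol) as client:
            yield client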
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -1,16 +1,16 @@
 from __future__ import annotations

+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Annotated, Any, Generator, Optional
+from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional

 from dateutil import parser
 from pydantic import Field, Secret
 from pydantic.functional_validators import BeforeValidator

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -28,6 +28,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict

+if TYPE_CHECKING:
+    from boxfs import BoxFileSystem
+
 CONNECTOR_TYPE = "box"


@@ -72,6 +75,12 @@ class BoxConnectionConfig(FsspecConnectionConfig):

         return access_kwargs_with_oauth

+    @requires_dependencies(["boxfs"], extras="box")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+

 @dataclass
 class BoxIndexer(FsspecIndexer):
@@ -79,14 +88,6 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def precheck(self) -> None:
-        super().precheck()
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = None
@@ -126,14 +127,6 @@ class BoxDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig)

-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-

 class BoxUploaderConfig(FsspecUploaderConfig):
     pass
@@ -145,22 +138,6 @@ class BoxUploader(FsspecUploader):
     connection_config: BoxConnectionConfig
     upload_config: BoxUploaderConfig = field(default=None)

-    @requires_dependencies(["boxfs"], extras="box")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-

 box_source_entry = SourceRegistryEntry(
     indexer=BoxIndexer,