unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
- test/integration/connectors/sql/test_postgres.py +9 -5
- test/integration/connectors/sql/test_singlestore.py +9 -5
- test/integration/connectors/sql/test_snowflake.py +6 -2
- test/integration/connectors/sql/test_sqlite.py +9 -5
- test/integration/connectors/test_astradb.py +40 -0
- test/integration/connectors/test_kafka.py +2 -2
- test/integration/connectors/test_mongodb.py +4 -1
- test/integration/connectors/utils/validation/source.py +31 -11
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +69 -15
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +37 -33
- unstructured_ingest/v2/processes/connectors/couchbase.py +52 -41
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/mongodb.py +94 -100
- unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +36 -26
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/METADATA +11 -10
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/RECORD +52 -52
- /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/steps/chunk.py

```diff
@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.chunker import Chunker
@@ -51,7 +52,7 @@ class ChunkStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str, **kwargs
     ) -> ChunkStepResponse:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping chunking, output already exists: {output_filepath}")
```
unstructured_ingest/v2/pipeline/steps/download.py

```diff
@@ -8,6 +8,7 @@ from typing import Callable, Optional, TypedDict, TypeVar
 
 from unstructured_ingest.v2.interfaces import FileData, download_responses
 from unstructured_ingest.v2.interfaces.downloader import Downloader
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.utils import serialize_base_model_json
@@ -87,12 +88,12 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-        logger.debug(f"updating file data with new content: {file_data.
+        logger.debug(f"updating file data with new content: {file_data.model_dump()}")
         with file_data_path.open("w") as file:
-            json.dump(file_data.
+            json.dump(file_data.model_dump(), file, indent=2)
 
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"skipping download, file already exists locally: {download_path}")
@@ -172,7 +173,7 @@ class DownloadStep(PipelineStep):
         filepath = (self.cache_dir / filename).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         with open(str(filepath), "w") as f:
-            json.dump(file_data.
+            json.dump(file_data.model_dump(), f, indent=2)
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
```
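The same change repeats across the chunk, embed, filter, partition, stage, uncompress, and upload steps: each step now rehydrates its cached record through the shared `file_data_from_file()` helper and serializes it with pydantic v2's `model_dump()` before `json.dump`. The sketch below is illustrative only; the stand-in `FileData` and `FileDataSourceMetadata` models are heavily simplified and are not the library's actual classes. It just shows the JSON round-trip the steps rely on.

```python
# Minimal sketch (stand-in classes, not unstructured_ingest's real models):
# illustrates the load/serialize pattern adopted in this release --
# a module-level file_data_from_file() helper plus pydantic v2 model_dump().
import json
from pathlib import Path
from typing import Optional

from pydantic import BaseModel


class FileDataSourceMetadata(BaseModel):  # simplified stand-in
    date_processed: Optional[str] = None
    filesize_bytes: Optional[int] = None


class FileData(BaseModel):  # simplified stand-in
    identifier: str
    connector_type: str
    metadata: FileDataSourceMetadata = FileDataSourceMetadata()


def file_data_from_file(path: str) -> FileData:
    # Read the cached JSON record and validate it back into the model.
    with open(path) as f:
        return FileData.model_validate(json.load(f))


if __name__ == "__main__":
    record = FileData(identifier="abc123", connector_type="local")
    cache_path = Path("file_data.json")
    # Serialize the way the updated steps do: model_dump() + json.dump().
    with cache_path.open("w") as f:
        json.dump(record.model_dump(), f, indent=2)
    assert file_data_from_file(path=str(cache_path)).identifier == "abc123"
```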
unstructured_ingest/v2/pipeline/steps/embed.py

```diff
@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.embedder import Embedder
@@ -49,7 +50,7 @@ class EmbedStep(PipelineStep):
 
     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping embedding, output already exists: {output_filepath}")
```
unstructured_ingest/v2/pipeline/steps/filter.py

```diff
@@ -2,7 +2,7 @@ import asyncio
 from dataclasses import dataclass
 from typing import Callable, Optional
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.filter import Filterer
@@ -20,7 +20,7 @@ class FilterStep(PipelineStep):
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
             resp = fn(**fn_kwargs)
```
unstructured_ingest/v2/pipeline/steps/index.py

```diff
@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.
+                    json.dump(file_data.model_dump(), f, indent=2)
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):
 
     async def run_async(self) -> AsyncGenerator[str, None]:
         async for file_data in self.process.run_async():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.
+                    json.dump(file_data.model_dump(), f, indent=2)
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
```
unstructured_ingest/v2/pipeline/steps/partition.py

```diff
@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.partitioner import Partitioner
@@ -51,12 +52,12 @@ class PartitionStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str
     ) -> Optional[PartitionStepResponse]:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata.
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.model_dump()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:
```
unstructured_ingest/v2/pipeline/steps/stage.py

```diff
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
@@ -43,7 +43,7 @@ class UploadStageStep(PipelineStep):
         output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
         fn_kwargs = {
             "elements_filepath": path,
-            "file_data":
+            "file_data": file_data_from_file(path=file_data_path),
             "output_dir": self.cache_dir,
             "output_filename": output_filename,
         }
```
unstructured_ingest/v2/pipeline/steps/uncompress.py

```diff
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, TypedDict
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.uncompress import Uncompressor
@@ -28,7 +28,7 @@ class UncompressStep(PipelineStep):
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
     ) -> list[UncompressStepResponse]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
        fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
             new_file_data = fn(**fn_kwargs)
```
unstructured_ingest/v2/pipeline/steps/upload.py

```diff
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.interfaces.uploader import UploadContent
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
@@ -41,14 +41,14 @@ class UploadStep(BatchPipelineStep):
     @instrument(span_name=STEP_ID)
     def _run_batch(self, contents: list[UploadStepContent]) -> None:
         upload_contents = [
-            UploadContent(path=Path(c["path"]), file_data=
+            UploadContent(path=Path(c["path"]), file_data=file_data_from_file(c["file_data_path"]))
             for c in contents
         ]
         self.process.run_batch(contents=upload_contents)
 
     async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
         fn = fn or self.process.run_async
-        fn_kwargs = {"path": Path(path), "file_data":
+        fn_kwargs = {"path": Path(path), "file_data": file_data_from_file(path=file_data_path)}
         if not asyncio.iscoroutinefunction(fn):
             fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:
```
unstructured_ingest/v2/processes/connectors/__init__.py

```diff
@@ -40,6 +40,8 @@ from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
 from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
+from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE
+from .neo4j import neo4j_destination_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -74,6 +76,7 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
 add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
+add_destination_entry(destination_type=NEO4J_CONNECTOR_TYPE, entry=neo4j_destination_entry)
 
 add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
 
```
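The new neo4j destination is wired in through the same registry calls the other connectors already use. The snippet below is an illustrative, dict-backed stand-in for that registry, not the library's actual implementation: the real `DestinationRegistryEntry` and `add_destination_entry` live in `unstructured_ingest.v2.processes.connector_registry` and carry more fields. It only demonstrates the registration pattern.

```python
# Illustrative sketch only -- a dict-backed connector registry of the kind
# the connectors __init__ populates via add_destination_entry().
from dataclasses import dataclass
from typing import Type


@dataclass
class DestinationRegistryEntry:  # simplified stand-in for the real entry class
    uploader: Type
    uploader_config: Type


destination_registry: dict[str, DestinationRegistryEntry] = {}


def add_destination_entry(destination_type: str, entry: DestinationRegistryEntry) -> None:
    # Registering the same connector type twice is treated as a programming error.
    if destination_type in destination_registry:
        raise ValueError(f"destination {destination_type} already registered")
    destination_registry[destination_type] = entry


class Neo4jUploader: ...        # placeholders standing in for the connector's classes
class Neo4jUploaderConfig: ...


NEO4J_CONNECTOR_TYPE = "neo4j"
add_destination_entry(
    destination_type=NEO4J_CONNECTOR_TYPE,
    entry=DestinationRegistryEntry(uploader=Neo4jUploader, uploader_config=Neo4jUploaderConfig),
)
```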
unstructured_ingest/v2/processes/connectors/astradb.py

```diff
@@ -1,13 +1,11 @@
-import copy
 import csv
 import hashlib
-import sys
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest import __name__ as integration_name
 from unstructured_ingest.__version__ import __version__ as integration_version
@@ -22,6 +20,8 @@ from unstructured_ingest.utils.string_and_date_utils import truncate_string_byte
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -30,6 +30,7 @@ from unstructured_ingest.v2.interfaces import (
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
+    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -53,6 +54,15 @@ CONNECTOR_TYPE = "astradb"
 MAX_CONTENT_PARAM_BYTE_SIZE = 8000
 
 
+class AstraDBAdditionalMetadata(BaseModel):
+    collection_name: str
+    keyspace: Optional[str] = None
+
+
+class AstraDBBatchFileData(BatchFileData):
+    additional_metadata: AstraDBAdditionalMetadata
+
+
 class AstraDBAccessConfig(AccessConfig):
     token: str = Field(description="Astra DB Token with access to the database.")
     api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
@@ -179,9 +189,6 @@ class AstraDBIndexer(Indexer):
 
     def _get_doc_ids(self) -> set[str]:
         """Fetches all document ids in an index"""
-        # Initialize set of ids
-        ids = set()
-
         # Get the collection
         collection = self.get_collection()
 
@@ -194,31 +201,26 @@ class AstraDBIndexer(Indexer):
             astra_db_docs.append(result)
 
         # Create file data for each astra record
-        for astra_record in astra_db_docs
-            ids.add(astra_record["_id"])
+        ids = sorted([astra_record["_id"] for astra_record in astra_db_docs])
 
-        return ids
+        return set(ids)
 
-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         id_batches = batch_generator(ids, self.index_config.batch_size)
 
         for batch in id_batches:
-
-            identified = str(hash(batch) + sys.maxsize + 1)
-            fd = FileData(
-                identifier=identified,
+            fd = AstraDBBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     date_processed=str(time()),
                 ),
-                additional_metadata=
-
-
-
-
+                additional_metadata=AstraDBAdditionalMetadata(
+                    collection_name=self.index_config.collection_name,
+                    keyspace=self.index_config.keyspace,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
             yield fd
 
@@ -247,7 +249,9 @@ class AstraDBDownloader(Downloader):
             writer.writerow(astra_result.keys())
             writer.writerow(astra_result.values())
 
-    def generate_download_response(
+    def generate_download_response(
+        self, result: dict, file_data: AstraDBBatchFileData
+    ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(record_id=record_id)
         filename = f"{filename_id}.csv"  # csv to preserve column info
@@ -255,7 +259,7 @@ class AstraDBDownloader(Downloader):
         logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            self.write_astra_result_to_csv(astra_result=result, download_path=download_path)
+            self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
         except Exception as e:
             logger.error(
                 f"failed to download from record {record_id} to {download_path}: {e}",
@@ -264,14 +268,13 @@ class AstraDBDownloader(Downloader):
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
 
         # modify input file_data for download_response
-
-
-
-
-
-        copied_file_data.additional_metadata.pop("ids", None)
+        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.record_locator = {"document_id": record_id}
         return super().generate_download_response(
-            file_data=
+            file_data=cast_file_data, download_path=download_path
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -279,9 +282,10 @@ class AstraDBDownloader(Downloader):
 
     async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
         # Get metadata from file_data
-
-
-
+        astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
+        ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
+        collection_name: str = astra_file_data.additional_metadata.collection_name
+        keyspace: str = astra_file_data.additional_metadata.keyspace
 
         # Retrieve results from async collection
         download_responses = []
@@ -292,7 +296,7 @@ class AstraDBDownloader(Downloader):
         )
         async for result in async_astra_collection.find({"_id": {"$in": ids}}):
             download_responses.append(
-                self.generate_download_response(result=result, file_data=
+                self.generate_download_response(result=result, file_data=astra_file_data)
             )
         return download_responses
 
```
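The AstraDB changes replace the old ad-hoc batch records built directly from `FileData` with a typed `AstraDBBatchFileData`. The sketch below uses simplified stand-ins for `BatchFileData`, `BatchItem`, and `cast()` (the real ones come from `unstructured_ingest.v2.interfaces`, and the real `cast()` implementation may differ); it shows the shape of the pattern: the indexer emits one record per batch of document ids with typed `additional_metadata`, and the downloader later recovers the typed view.

```python
# Sketch of the batch-file-data pattern the AstraDB connector moves to
# (stand-in classes; not the library's actual BatchFileData/BatchItem).
from typing import Optional

from pydantic import BaseModel


class BatchItem(BaseModel):
    identifier: str


class BatchFileData(BaseModel):
    connector_type: str
    batch_items: list[BatchItem]

    @classmethod
    def cast(cls, file_data: BaseModel) -> "BatchFileData":
        # Assumed behaviour: re-validate a generic record as this subclass.
        return cls.model_validate(file_data.model_dump())


class AstraDBAdditionalMetadata(BaseModel):
    collection_name: str
    keyspace: Optional[str] = None


class AstraDBBatchFileData(BatchFileData):
    additional_metadata: AstraDBAdditionalMetadata


# An indexer would yield one of these per batch of document ids:
batch = AstraDBBatchFileData(
    connector_type="astradb",
    additional_metadata=AstraDBAdditionalMetadata(collection_name="docs"),
    batch_items=[BatchItem(identifier=_id) for _id in ["a", "b", "c"]],
)

# Downstream, a downloader recovers the typed view from a generic record:
typed = AstraDBBatchFileData.cast(file_data=batch)
ids = [item.identifier for item in typed.batch_items]
```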
unstructured_ingest/v2/processes/connectors/couchbase.py

```diff
@@ -1,5 +1,4 @@
 import hashlib
-import sys
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -7,7 +6,7 @@ from datetime import timedelta
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, List
 
-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -18,6 +17,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -26,6 +27,7 @@ from unstructured_ingest.v2.interfaces import (
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
+    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -40,11 +42,20 @@ from unstructured_ingest.v2.processes.connector_registry import (
 
 if TYPE_CHECKING:
     from couchbase.cluster import Cluster
+    from couchbase.collection import Collection
 
 CONNECTOR_TYPE = "couchbase"
 SERVER_API_VERSION = "1"
 
 
+class CouchbaseAdditionalMetadata(BaseModel):
+    bucket: str
+
+
+class CouchbaseBatchFileData(BatchFileData):
+    additional_metadata: CouchbaseAdditionalMetadata
+
+
 class CouchbaseAccessConfig(AccessConfig):
     password: str = Field(description="The password for the Couchbase server")
 
@@ -180,31 +191,21 @@ class CouchbaseIndexer(Indexer):
                 if attempts == max_attempts:
                     raise SourceConnectionError(f"failed to get document ids: {e}")
 
-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[CouchbaseBatchFileData, None, None]:
         ids = self._get_doc_ids()
-
-        id_batches = [
-            ids[i * self.index_config.batch_size : (i + 1) * self.index_config.batch_size]
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-
-            yield FileData(
-                identifier=identified,
+            yield CouchbaseBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",
                     date_processed=str(time.time()),
                 ),
-                additional_metadata=
-
-
-
+                additional_metadata=CouchbaseAdditionalMetadata(
+                    bucket=self.connection_config.bucket
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
 
 
@@ -241,7 +242,7 @@ class CouchbaseDownloader(Downloader):
         return concatenated_values
 
     def generate_download_response(
-        self, result: dict, bucket: str, file_data:
+        self, result: dict, bucket: str, file_data: CouchbaseBatchFileData
     ) -> DownloadResponse:
         record_id = result[self.download_config.collection_id]
         filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
@@ -261,28 +262,26 @@ class CouchbaseDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            ),
-            ),
-            path=download_path,
+        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time.time())
+        cast_file_data.metadata.record_locator = {
+            "connection_string": self.connection_config.connection_string,
+            "bucket": bucket,
+            "scope": self.connection_config.scope,
+            "collection": self.connection_config.collection,
+            "document_id": record_id,
+        }
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
-
-
+        couchbase_file_data = CouchbaseBatchFileData.cast(file_data=file_data)
+        bucket_name: str = couchbase_file_data.additional_metadata.bucket
+        ids: list[str] = [item.identifier for item in couchbase_file_data.batch_items]
 
         with self.connection_config.get_client() as client:
             bucket = client.bucket(bucket_name)
@@ -292,13 +291,25 @@ class CouchbaseDownloader(Downloader):
             download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
             return list(download_resp)
 
-    def process_doc_id(
+    def process_doc_id(
+        self,
+        doc_id: str,
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
         result = collection.get(doc_id)
         return self.generate_download_response(
             result=result.content_as[dict], bucket=bucket_name, file_data=file_data
         )
 
-    def process_all_doc_ids(
+    def process_all_doc_ids(
+        self,
+        ids: list[str],
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
         for doc_id in ids:
             yield self.process_doc_id(doc_id, collection, bucket_name, file_data)
 
```
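The Couchbase indexer drops its hand-rolled list slicing in favor of the shared `batch_generator` helper. Below is a minimal sketch of equivalent batching behaviour, assuming it matches the slicing it replaces; the real helper lives in `unstructured_ingest.utils.data_prep` and its implementation may differ in detail.

```python
# Minimal batch_generator sketch: split an iterable into lists of at most
# batch_size items, yielding a trailing partial batch if one remains.
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")


def batch_generator(items: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    batch: list[T] = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # trailing partial batch
        yield batch


assert list(batch_generator(range(5), 2)) == [[0, 1], [2, 3], [4]]
```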
unstructured_ingest/v2/processes/connectors/databricks/volumes.py

```diff
@@ -14,6 +14,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -52,6 +53,10 @@ class DatabricksPathMixin(BaseModel):
         return path
 
 
+class DatabricksVolumesAccessConfig(AccessConfig):
+    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
+
+
 class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
     host: Optional[str] = Field(
         default=None,
```
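The per-cloud Databricks configs that follow no longer import `AccessConfig` directly; each now subclasses the new shared `DatabricksVolumesAccessConfig`, which owns the `token` field. A simplified sketch of that inheritance is below, with a stand-in `AccessConfig` base so it runs on its own; the real classes carry more fields and wrap secrets.

```python
# Sketch of the refactor: the shared `token` field moves into a common
# DatabricksVolumesAccessConfig base; cloud-specific configs inherit it
# instead of redefining it (class bodies simplified here).
from typing import Optional

from pydantic import BaseModel, Field


class AccessConfig(BaseModel):  # stand-in for the library's base class
    pass


class DatabricksVolumesAccessConfig(AccessConfig):
    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")


class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None, description="The Databricks account ID for the Databricks accounts endpoint"
    )


config = DatabricksAWSVolumesAccessConfig(token="dapi...", account_id="123")
print(config.model_dump())  # both the inherited token and the subclass account_id appear
```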
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py

```diff
@@ -3,12 +3,12 @@ from typing import Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
 CONNECTOR_TYPE = "databricks_volumes_aws"
 
 
-class DatabricksAWSVolumesAccessConfig(
+class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
         description="The Databricks account ID for the Databricks " "accounts endpoint",
```
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py

```diff
@@ -3,12 +3,12 @@ from typing import Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
 CONNECTOR_TYPE = "databricks_volumes_azure"
 
 
-class DatabricksAzureVolumesAccessConfig(
+class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
         description="The Databricks account ID for the Databricks " "accounts endpoint.",
```
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py

```diff
@@ -3,12 +3,12 @@ from typing import Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
 CONNECTOR_TYPE = "databricks_volumes_gcp"
 
 
-class DatabricksGoogleVolumesAccessConfig(
+class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
         description="The Databricks account ID for the Databricks " "accounts endpoint.",
```