unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/interfaces/upload_stager.py
@@ -1,8 +1,10 @@
-
+import json
+from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, TypeVar
 
+import ndjson
 from pydantic import BaseModel
 
 from unstructured_ingest.v2.interfaces.file_data import FileData
@@ -20,16 +22,78 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
 class UploadStager(BaseProcess, ABC):
     upload_stager_config: UploadStagerConfigT
 
-
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
+        if output_path.suffix == ".json":
+            with output_path.open("w") as f:
+                json.dump(data, f, indent=2)
+        elif output_path.suffix == ".ndjson":
+            with output_path.open("w") as f:
+                ndjson.dump(data, f)
+        else:
+            raise ValueError(f"Unsupported output format: {output_path}")
+
+    def get_data(self, elements_filepath: Path) -> list[dict]:
+        if elements_filepath.suffix == ".json":
+            with elements_filepath.open() as f:
+                return json.load(f)
+        elif elements_filepath.suffix == ".ndjson":
+            with elements_filepath.open() as f:
+                return ndjson.load(f)
+        else:
+            raise ValueError(f"Unsupported input format: {elements_filepath}")
+
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        return element_dict
+
+    def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
+        output_path = Path(output_filename)
+        output_filename = f"{Path(output_filename).stem}{output_path.suffix}"
+        output_path = Path(output_dir) / Path(f"{output_filename}")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        return output_path
+
+    def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+        with input_file.open() as in_f:
+            reader = ndjson.reader(in_f)
+            with output_file.open("w") as out_f:
+                writer = ndjson.writer(out_f)
+                for element in reader:
+                    conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
+                    writer.writerow(row=conformed_element)
+                    writer.f.flush()
+
+    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+        with input_file.open() as in_f:
+            elements_contents = json.load(in_f)
+
+        conformed_elements = [
+            self.conform_dict(element_dict=element, file_data=file_data)
+            for element in elements_contents
+        ]
+
+        with open(output_file, "w") as out_f:
+            json.dump(conformed_elements, out_f, indent=2)
+
     def run(
         self,
         elements_filepath: Path,
         file_data: FileData,
         output_dir: Path,
         output_filename: str,
-        **kwargs: Any
+        **kwargs: Any,
     ) -> Path:
-
+        output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
+        if elements_filepath.suffix == ".ndjson":
+            self.stream_update(
+                input_file=elements_filepath, output_file=output_file, file_data=file_data
+            )
+        elif elements_filepath.suffix == ".json":
+            self.process_whole(
+                input_file=elements_filepath, output_file=output_file, file_data=file_data
+            )
+        else:
+            raise ValueError(f"Unsupported file extension: {elements_filepath}")
+        return output_file
 
     async def run_async(
         self,
@@ -37,12 +101,12 @@ class UploadStager(BaseProcess, ABC):
         file_data: FileData,
         output_dir: Path,
         output_filename: str,
-        **kwargs: Any
+        **kwargs: Any,
     ) -> Path:
         return self.run(
             elements_filepath=elements_filepath,
             output_dir=output_dir,
             output_filename=output_filename,
             file_data=file_data,
-            **kwargs
+            **kwargs,
         )
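The refactored base class above now owns all of the JSON/NDJSON reading and writing, so a concrete stager only has to override conform_dict. A minimal sketch of what that looks like; the RecordIdStager name and the field it writes are illustrative, not part of the package:

```python
from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager


class RecordIdStager(UploadStager):
    # Hypothetical subclass: the base run() dispatches to stream_update() for
    # .ndjson inputs and process_whole() for .json inputs, calling this hook
    # once per element dict in either case.
    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        element_dict["record_id"] = file_data.identifier
        return element_dict
```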

unstructured_ingest/v2/interfaces/uploader.py
@@ -5,6 +5,7 @@ from typing import Any, TypeVar
 
 from pydantic import BaseModel
 
+from unstructured_ingest.utils.data_prep import get_data
 from unstructured_ingest.v2.interfaces.connector import BaseConnector
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
@@ -38,7 +39,15 @@ class Uploader(BaseProcess, BaseConnector, ABC):
         raise NotImplementedError()
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
+        data = get_data(path=path)
+        self.run_data(data=data, file_data=file_data, **kwargs)
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
+        data = get_data(path=path)
+        await self.run_data_async(data=data, file_data=file_data, **kwargs)
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        raise NotImplementedError()
+
+    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        return self.run_data(data=data, file_data=file_data, **kwargs)
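With this change, Uploader.run()/run_async() read the staged file via get_data() and hand a list of element dicts to run_data(), so a destination only needs to implement the data-level hook. A minimal sketch under that assumption, using a hypothetical destination that only counts elements; real connectors also declare their connection and upload configs, as the AstraDB diff further down shows:

```python
from dataclasses import dataclass
from typing import Any

from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.uploader import Uploader


@dataclass
class CountingUploader(Uploader):
    # Hypothetical destination: the base run()/run_async() now handle reading
    # the staged .json/.ndjson file, so only run_data() is implemented here.
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        print(f"{file_data.identifier}: would upload {len(data)} elements")
```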

unstructured_ingest/v2/pipeline/steps/chunk.py
@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.chunker import Chunker
@@ -51,7 +52,7 @@ class ChunkStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str, **kwargs
     ) -> ChunkStepResponse:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping chunking, output already exists: {output_filepath}")

unstructured_ingest/v2/pipeline/steps/download.py
@@ -8,6 +8,7 @@ from typing import Callable, Optional, TypedDict, TypeVar
 
 from unstructured_ingest.v2.interfaces import FileData, download_responses
 from unstructured_ingest.v2.interfaces.downloader import Downloader
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.utils import serialize_base_model_json
@@ -87,12 +88,12 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-            logger.debug(f"updating file data with new content: {file_data.
+            logger.debug(f"updating file data with new content: {file_data.model_dump()}")
             with file_data_path.open("w") as file:
-                json.dump(file_data.
+                json.dump(file_data.model_dump(), file, indent=2)
 
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"skipping download, file already exists locally: {download_path}")
@@ -172,7 +173,7 @@ class DownloadStep(PipelineStep):
         filepath = (self.cache_dir / filename).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         with open(str(filepath), "w") as f:
-            json.dump(file_data.
+            json.dump(file_data.model_dump(), f, indent=2)
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
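The pattern running through these pipeline steps is a plain JSON round trip for FileData: the index and download steps persist it with pydantic's model_dump() plus json.dump, and downstream steps rehydrate it with the new file_data_from_file() helper. A minimal sketch of that round trip; the helper functions here are illustrative wrappers, only the imports and the two library calls come from the diff:

```python
import json
from pathlib import Path

from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file


def persist_file_data(file_data: FileData, path: Path) -> None:
    # Same serialization the steps above use: pydantic model -> dict -> JSON.
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w") as f:
        json.dump(file_data.model_dump(), f, indent=2)


def load_file_data(path: Path) -> FileData:
    # Rehydrate with the helper introduced in this release.
    return file_data_from_file(path=str(path))
```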

unstructured_ingest/v2/pipeline/steps/embed.py
@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.embedder import Embedder
@@ -49,7 +50,7 @@ class EmbedStep(PipelineStep):
 
     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping embedding, output already exists: {output_filepath}")

unstructured_ingest/v2/pipeline/steps/filter.py
@@ -2,7 +2,7 @@ import asyncio
 from dataclasses import dataclass
 from typing import Callable, Optional
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.filter import Filterer
@@ -20,7 +20,7 @@ class FilterStep(PipelineStep):
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
             resp = fn(**fn_kwargs)

unstructured_ingest/v2/pipeline/steps/index.py
@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.
+                    json.dump(file_data.model_dump(), f, indent=2)
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):
 
     async def run_async(self) -> AsyncGenerator[str, None]:
         async for file_data in self.process.run_async():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.
+                    json.dump(file_data.model_dump(), f, indent=2)
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)

unstructured_ingest/v2/pipeline/steps/partition.py
@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.partitioner import Partitioner
@@ -51,12 +52,12 @@ class PartitionStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str
     ) -> Optional[PartitionStepResponse]:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata.
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.model_dump()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:

unstructured_ingest/v2/pipeline/steps/stage.py
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
@@ -39,11 +39,13 @@ class UploadStageStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str
     ) -> UploadStageStepResponse:
         path = Path(path)
+        # Maintain extension
+        output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
         fn_kwargs = {
             "elements_filepath": path,
-            "file_data":
+            "file_data": file_data_from_file(path=file_data_path),
             "output_dir": self.cache_dir,
-            "output_filename":
+            "output_filename": output_filename,
         }
         if not asyncio.iscoroutinefunction(fn):
             staged_output_path = fn(**fn_kwargs)

unstructured_ingest/v2/pipeline/steps/uncompress.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, TypedDict
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.uncompress import Uncompressor
@@ -28,7 +28,7 @@ class UncompressStep(PipelineStep):
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
     ) -> list[UncompressStepResponse]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
             new_file_data = fn(**fn_kwargs)

unstructured_ingest/v2/pipeline/steps/upload.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.interfaces.uploader import UploadContent
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
@@ -41,14 +41,14 @@ class UploadStep(BatchPipelineStep):
     @instrument(span_name=STEP_ID)
     def _run_batch(self, contents: list[UploadStepContent]) -> None:
         upload_contents = [
-            UploadContent(path=Path(c["path"]), file_data=
+            UploadContent(path=Path(c["path"]), file_data=file_data_from_file(c["file_data_path"]))
             for c in contents
         ]
         self.process.run_batch(contents=upload_contents)
 
     async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
         fn = fn or self.process.run_async
-        fn_kwargs = {"path": Path(path), "file_data":
+        fn_kwargs = {"path": Path(path), "file_data": file_data_from_file(path=file_data_path)}
         if not asyncio.iscoroutinefunction(fn):
             fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:

unstructured_ingest/v2/processes/connectors/__init__.py
@@ -40,6 +40,8 @@ from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
 from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
+from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE
+from .neo4j import neo4j_destination_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -74,6 +76,7 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
 add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
+add_destination_entry(destination_type=NEO4J_CONNECTOR_TYPE, entry=neo4j_destination_entry)
 
 add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
 

unstructured_ingest/v2/processes/connectors/astradb.py
@@ -1,14 +1,11 @@
-import copy
 import csv
 import hashlib
-import json
-import sys
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest import __name__ as integration_name
 from unstructured_ingest.__version__ import __version__ as integration_version
@@ -17,12 +14,14 @@ from unstructured_ingest.error import (
     SourceConnectionError,
     SourceConnectionNetworkError,
 )
-from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.data_prep import batch_generator, get_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -54,6 +53,15 @@ CONNECTOR_TYPE = "astradb"
 MAX_CONTENT_PARAM_BYTE_SIZE = 8000
 
 
+class AstraDBAdditionalMetadata(BaseModel):
+    collection_name: str
+    keyspace: Optional[str] = None
+
+
+class AstraDBBatchFileData(BatchFileData):
+    additional_metadata: AstraDBAdditionalMetadata
+
+
 class AstraDBAccessConfig(AccessConfig):
     token: str = Field(description="Astra DB Token with access to the database.")
     api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
@@ -180,9 +188,6 @@ class AstraDBIndexer(Indexer):
 
     def _get_doc_ids(self) -> set[str]:
         """Fetches all document ids in an index"""
-        # Initialize set of ids
-        ids = set()
-
         # Get the collection
         collection = self.get_collection()
 
@@ -195,31 +200,26 @@ class AstraDBIndexer(Indexer):
             astra_db_docs.append(result)
 
         # Create file data for each astra record
-        for astra_record in astra_db_docs
-            ids.add(astra_record["_id"])
+        ids = sorted([astra_record["_id"] for astra_record in astra_db_docs])
 
-        return ids
+        return set(ids)
 
-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         id_batches = batch_generator(ids, self.index_config.batch_size)
 
         for batch in id_batches:
-
-            identified = str(hash(batch) + sys.maxsize + 1)
-            fd = FileData(
-                identifier=identified,
+            fd = AstraDBBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     date_processed=str(time()),
                 ),
-                additional_metadata=
-
-
-
-
+                additional_metadata=AstraDBAdditionalMetadata(
+                    collection_name=self.index_config.collection_name,
+                    keyspace=self.index_config.keyspace,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
             yield fd
 
@@ -248,7 +248,9 @@ class AstraDBDownloader(Downloader):
             writer.writerow(astra_result.keys())
             writer.writerow(astra_result.values())
 
-    def generate_download_response(
+    def generate_download_response(
+        self, result: dict, file_data: AstraDBBatchFileData
+    ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(record_id=record_id)
         filename = f"{filename_id}.csv" # csv to preserve column info
@@ -256,7 +258,7 @@ class AstraDBDownloader(Downloader):
         logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            self.write_astra_result_to_csv(astra_result=result, download_path=download_path)
+            self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
         except Exception as e:
             logger.error(
                 f"failed to download from record {record_id} to {download_path}: {e}",
@@ -265,14 +267,12 @@ class AstraDBDownloader(Downloader):
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
 
         # modify input file_data for download_response
-
-
-
-
-        copied_file_data.metadata.record_locator = {"document_id": record_id}
-        copied_file_data.additional_metadata.pop("ids", None)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.record_locator = {"document_id": record_id}
         return super().generate_download_response(
-            file_data=
+            file_data=cast_file_data, download_path=download_path
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -280,9 +280,10 @@ class AstraDBDownloader(Downloader):
 
    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
         # Get metadata from file_data
-
-
-
+        astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
+        ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
+        collection_name: str = astra_file_data.additional_metadata.collection_name
+        keyspace: str = astra_file_data.additional_metadata.keyspace
 
         # Retrieve results from async collection
         download_responses = []
@@ -293,7 +294,7 @@ class AstraDBDownloader(Downloader):
         )
         async for result in async_astra_collection.find({"_id": {"$in": ids}}):
             download_responses.append(
-                self.generate_download_response(result=result, file_data=
+                self.generate_download_response(result=result, file_data=astra_file_data)
             )
         return download_responses
 
@@ -325,29 +326,6 @@ class AstraDBUploadStager(UploadStager):
             "metadata": element_dict,
         }
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        conformed_elements = []
-        for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
-        output_filename_path = Path(output_filename)
-        if output_filename_path.suffix == ".json":
-            output_path = Path(output_dir) / output_filename_path
-        else:
-            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file, indent=2)
-        return output_path
-
 
 @dataclass
 class AstraDBUploader(Uploader):
@@ -386,11 +364,9 @@ class AstraDBUploader(Uploader):
             f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )
 
-    def
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(
+            f"writing {len(data)} objects to destination "
            f"collection {self.upload_config.collection_name}"
        )
 
@@ -399,9 +375,13 @@ class AstraDBUploader(Uploader):
 
         self.delete_by_record_id(collection=collection, file_data=file_data)
 
-        for chunk in batch_generator(
+        for chunk in batch_generator(data, astra_db_batch_size):
            collection.insert_many(chunk)
 
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        data = get_data(path=path)
+        self.run_data(data=data, file_data=file_data, **kwargs)
+
 
 astra_db_source_entry = SourceRegistryEntry(
     indexer=AstraDBIndexer,