unstructured-ingest 0.0.19__py3-none-any.whl → 0.0.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/astradb.py +2 -2
- unstructured_ingest/connector/astradb.py +54 -24
- unstructured_ingest/embed/bedrock.py +56 -19
- unstructured_ingest/embed/huggingface.py +22 -22
- unstructured_ingest/embed/interfaces.py +11 -4
- unstructured_ingest/embed/mixedbreadai.py +17 -17
- unstructured_ingest/embed/octoai.py +7 -7
- unstructured_ingest/embed/openai.py +15 -20
- unstructured_ingest/embed/vertexai.py +25 -17
- unstructured_ingest/embed/voyageai.py +22 -17
- unstructured_ingest/v2/cli/base/cmd.py +1 -1
- unstructured_ingest/v2/interfaces/connector.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +3 -1
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
- unstructured_ingest/v2/pipeline/steps/download.py +6 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +4 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
- unstructured_ingest/v2/processes/chunker.py +8 -29
- unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +26 -19
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +11 -8
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
- unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
- unstructured_ingest/v2/processes/partitioner.py +9 -55
- unstructured_ingest/v2/unstructured_api.py +87 -0
- unstructured_ingest/v2/utils.py +1 -1
- unstructured_ingest-0.0.22.dist-info/METADATA +186 -0
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/RECORD +46 -45
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/WHEEL +1 -1
- unstructured_ingest-0.0.19.dist-info/METADATA +0 -639
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/top_level.txt +0 -0
unstructured_ingest/embed/vertexai.py

@@ -3,7 +3,7 @@ import json
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Annotated, Any,
+from typing import TYPE_CHECKING, Annotated, Any, Optional
 
 import numpy as np
 from pydantic import Field, Secret, ValidationError
@@ -13,7 +13,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
-    from
+    from vertexai.language_models import TextEmbeddingModel
 
 
 def conform_string_to_dict(value: Any) -> dict:
@@ -41,45 +41,53 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
         os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(application_credentials_path)
 
     @requires_dependencies(
-        ["
+        ["vertexai"],
         extras="embed-vertexai",
     )
-    def get_client(self) -> "
+    def get_client(self) -> "TextEmbeddingModel":
         """Creates a Langchain VertexAI python client to embed elements."""
-        from
+        from vertexai.language_models import TextEmbeddingModel
 
         self.register_application_credentials()
-
-        return vertexai_client
+        return TextEmbeddingModel.from_pretrained(self.embedder_model_name)
 
 
 @dataclass
 class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VertexAIEmbeddingConfig
 
-    def get_exemplary_embedding(self) ->
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="A sample query.")
 
-    def num_of_dimensions(self):
+    def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.shape(exemplary_embedding)
 
-    def is_unit_vector(self):
+    def is_unit_vector(self) -> bool:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
 
     def embed_query(self, query):
-
-        result = client.embed_query(str(query))
-        return result
+        return self._embed_documents(elements=[query])[0]
 
-    def embed_documents(self, elements:
-
-        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
 
-
+    @requires_dependencies(
+        ["vertexai"],
+        extras="embed-vertexai",
+    )
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        from vertexai.language_models import TextEmbeddingInput
+
+        client = self.config.get_client()
+        inputs = [TextEmbeddingInput(text=element) for element in elements]
+        embeddings = client.get_embeddings(inputs)
+        return [e.values for e in embeddings]
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
         for i, element in enumerate(elements):

unstructured_ingest/embed/voyageai.py

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Optional
 
 import numpy as np
 from pydantic import Field, SecretStr
@@ -8,7 +8,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
-    from
+    from voyageai import Client as VoyageAIClient
 
 
 class VoyageAIEmbeddingConfig(EmbeddingConfig):
@@ -16,28 +16,30 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
     embedder_model_name: str = Field(alias="model_name")
     batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
+    max_retries: int = 0
+    timeout_in_seconds: Optional[int] = None
 
     @requires_dependencies(
         ["langchain", "langchain_voyageai"],
         extras="embed-voyageai",
     )
-    def get_client(self) -> "
+    def get_client(self) -> "VoyageAIClient":
         """Creates a Langchain VoyageAI python client to embed elements."""
-        from
+        from voyageai import Client as VoyageAIClient
 
-
-
-
-
-            truncation=self.truncation,
+        client = VoyageAIClient(
+            api_key=self.api_key.get_secret_value(),
+            max_retries=self.max_retries,
+            timeout=self.timeout_in_seconds,
         )
+        return client
 
 
 @dataclass
 class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VoyageAIEmbeddingConfig
 
-    def get_exemplary_embedding(self) ->
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="A sample query.")
 
     @property
@@ -50,17 +52,20 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
 
-    def
-        client = self.config.get_client()
-
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        client: VoyageAIClient = self.config.get_client()
+        response = client.embed(texts=elements, model=self.config.embedder_model_name)
+        return response.embeddings
+
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
         return self._add_embeddings_to_elements(elements, embeddings)
 
-    def embed_query(self, query: str) ->
-
-        return client.embed_query(query)
+    def embed_query(self, query: str) -> list[float]:
+        return self._embed_documents(elements=[query])[0]
 
     @staticmethod
-    def _add_embeddings_to_elements(elements, embeddings) ->
+    def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
         for i, element in enumerate(elements):

unstructured_ingest/v2/cli/base/cmd.py

@@ -155,7 +155,7 @@ class BaseCmd(ABC):
     @staticmethod
     def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
         filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
-        if not filterer_configs.
+        if not filterer_configs.model_dump():
             return None
         return Filterer(config=filterer_configs)
 

unstructured_ingest/v2/interfaces/connector.py

@@ -19,7 +19,7 @@ class ConnectionConfig(BaseModel):
     def get_access_config(self) -> dict[str, Any]:
         if not self.access_config:
             return {}
-        return self.access_config.get_secret_value().
+        return self.access_config.get_secret_value().model_dump()
 
 
 ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)

unstructured_ingest/v2/pipeline/pipeline.py

@@ -187,7 +187,9 @@ class Pipeline:
         return filtered_records
 
     def _run(self):
-        logger.info(
+        logger.info(
+            f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
+        )
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()

unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -28,7 +28,7 @@ class ChunkStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.chunking_strategy})"
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/download.py

@@ -31,9 +31,13 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config =
+        config = (
+            self.process.download_config.model_dump_json() if self.process.download_config else None
+        )
         connection_config = (
-            self.process.connection_config.
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/pipeline/steps/embed.py

@@ -28,7 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/filter.py

@@ -16,7 +16,7 @@ class FilterStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:

unstructured_ingest/v2/pipeline/steps/index.py

@@ -23,9 +23,11 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = self.process.index_config.
+        config = self.process.index_config.model_dump_json() if self.process.index_config else None
         connection_config = (
-            self.process.connection_config.
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/pipeline/steps/partition.py

@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json()
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/stage.py

@@ -28,7 +28,9 @@ class UploadStageStep(PipelineStep):
 
     def __post_init__(self):
         config = (
-            self.process.upload_stager_config.
+            self.process.upload_stager_config.model_dump_json()
+            if self.process.upload_stager_config
+            else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"created {self.identifier} with configs: {config}")

unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -22,7 +22,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(

unstructured_ingest/v2/pipeline/steps/upload.py

@@ -25,9 +25,13 @@ class UploadStep(BatchPipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config =
+        config = (
+            self.process.upload_config.model_dump_json() if self.process.upload_config else None
+        )
         connection_config = (
-            self.process.connection_config.
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/processes/chunker.py

@@ -1,5 +1,5 @@
 from abc import ABC
-from dataclasses import dataclass
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional
 
@@ -9,6 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.unstructured_api import call_api
 
 CHUNK_MAX_CHARS_DEFAULT: int = 500
 CHUNK_MULTI_PAGE_DEFAULT: bool = True
@@ -111,35 +112,13 @@ class Chunker(BaseProcess, ABC):
 
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-
-        from unstructured_client.models.operations import PartitionRequest
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key.get_secret_value(),
+        elements = await call_api(
             server_url=self.config.chunking_endpoint,
+            api_key=self.config.chunk_api_key.get_secret_value(),
+            filename=elements_filepath,
+            api_parameters=self.config.to_chunking_kwargs(),
         )
-
-        possible_fields = [f.name for f in fields(PartitionParameters)]
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        with open(elements_filepath, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(elements_filepath.resolve()),
-            )
-        filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        partition_request_obj = PartitionRequest(partition_params)
-        resp = client.general.partition(partition_request_obj)
-        elements = resp.elements or []
+
         elements = assign_and_map_hash_ids(elements=elements)
+
         return elements

unstructured_ingest/v2/processes/connectors/airtable.py

@@ -181,7 +181,7 @@ class AirtableIndexer(Indexer):
             yield FileData(
                 identifier=table_meta.get_id(),
                 connector_type=CONNECTOR_TYPE,
-                additional_metadata=table_meta.
+                additional_metadata=table_meta.model_dump(),
                 source_identifiers=SourceIdentifiers(
                     filename=str(Path(fullpath).name),
                     fullpath=fullpath,

unstructured_ingest/v2/processes/connectors/astradb.py

@@ -25,7 +25,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 
 if TYPE_CHECKING:
-    from astrapy
+    from astrapy import Collection as AstraDBCollection
+
 
 CONNECTOR_TYPE = "astradb"
 
@@ -85,7 +86,12 @@ class AstraDBUploaderConfig(UploaderConfig):
     embedding_dimension: int = Field(
         default=384, description="The dimensionality of the embeddings"
    )
-
+    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The Astra DB connection namespace.",
+        deprecated="Please use 'keyspace' instead.",
+    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
@@ -109,33 +115,34 @@ class AstraDBUploader(Uploader):
 
     @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
-        from astrapy
+        from astrapy import DataAPIClient as AstraDBClient
 
-        #
-
-        embedding_dimension = self.upload_config.embedding_dimension
-        requested_indexing_policy = self.upload_config.requested_indexing_policy
+        # Choose keyspace or deprecated namespace
+        keyspace_param = self.upload_config.keyspace or self.upload_config.namespace
 
-        #
-
+        # Get the collection_name
+        collection_name = self.upload_config.collection_name
 
         # Build the Astra DB object.
-        # caller_name/version for AstraDB tracking
         access_configs = self.connection_config.access_config.get_secret_value()
-
-
-
-
+
+        # Create a client object to interact with the Astra DB
+        # caller_name/version for Astra DB tracking
+        my_client = AstraDBClient(
             caller_name=integration_name,
             caller_version=integration_version,
         )
 
-        #
-
-
-
-
+        # Get the database object
+        astra_db = my_client.get_database(
+            api_endpoint=access_configs.api_endpoint,
+            token=access_configs.token,
+            keyspace=keyspace_param,
         )
+
+        # Connect to the newly created collection
+        astra_db_collection = astra_db.get_collection(name=collection_name)
+
         return astra_db_collection
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -42,8 +42,10 @@ class DatabricksVolumesAccessConfig(AccessConfig):
         description="The Databricks password part of basic authentication. "
         "Only possible when Host is *.cloud.databricks.com (AWS).",
     )
-    client_id: Optional[str] = Field(default=None)
-    client_secret: Optional[str] = Field(
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
     token: Optional[str] = Field(
         default=None,
         description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
@@ -128,7 +130,7 @@ class DatabricksVolumesUploader(Uploader):
 
         return WorkspaceClient(
             host=self.connection_config.host,
-            **self.connection_config.access_config.get_secret_value().
+            **self.connection_config.access_config.get_secret_value().model_dump(),
         )
 
     def precheck(self) -> None:
@@ -140,11 +142,12 @@ class DatabricksVolumesUploader(Uploader):
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         output_path = os.path.join(self.upload_config.path, path.name)
-
-
-
-
-
+        with open(path, "rb") as elements_file:
+            self.get_client().files.upload(
+                file_path=output_path,
+                contents=elements_file,
+                overwrite=self.upload_config.overwrite,
+            )
 
 
 databricks_volumes_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -104,8 +104,8 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"elasticsearch client inputs mapped to: {client_input.
-        client_kwargs = client_input.
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.model_dump()}")
+        client_kwargs = client_input.model_dump()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
         )

unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -2,12 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -84,7 +85,7 @@ class AzureConnectionConfig(FsspecConnectionConfig):
     def get_access_config(self) -> dict[str, Any]:
         # Avoid injecting None by filtering out k,v pairs where the value is None
         access_configs: dict[str, Any] = {
-            k: v for k, v in self.access_config.get_secret_value().
+            k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v
         }
         return access_configs
 
@@ -99,14 +100,39 @@ class AzureIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
-    def sterilize_info(self,
-
-        return sterilize_dict(data=info, default=azure_json_serial)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data, default=azure_json_serial)
 
     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = (
+            file_data.get("creation_time").timestamp() if "creation_time" in file_data else None
+        )
+        date_modified = (
+            file_data.get("last_modified").timestamp() if "last_modified" in file_data else None
+        )
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("etag")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class AzureDownloaderConfig(FsspecDownloaderConfig):
     pass

unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -2,12 +2,14 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
+from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -52,7 +54,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
                 ac.box_app_config,
             ),
         }
-        access_config: dict[str, Any] = ac.
+        access_config: dict[str, Any] = ac.model_dump()
         access_config.pop("box_app_config", None)
         access_kwargs_with_oauth.update(access_config)
 
@@ -73,6 +75,33 @@ class BoxIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = None
+        date_modified = None
+        if modified_at_str := file_data.get("modified_at"):
+            date_modified = parser.parse(modified_at_str).timestamp()
+        if created_at_str := file_data.get("created_at"):
+            date_created = parser.parse(created_at_str).timestamp()
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("id")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class BoxDownloaderConfig(FsspecDownloaderConfig):
     pass