unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.

Files changed (83)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/test_postgres.py +100 -0
  10. test/integration/connectors/test_s3.py +152 -0
  11. test/integration/connectors/test_sqlite.py +91 -0
  12. test/integration/connectors/utils/__init__.py +0 -0
  13. test/integration/connectors/utils/constants.py +7 -0
  14. test/integration/connectors/utils/docker_compose.py +44 -0
  15. test/integration/connectors/utils/validation.py +198 -0
  16. test/integration/embedders/__init__.py +0 -0
  17. test/integration/embedders/conftest.py +13 -0
  18. test/integration/embedders/test_bedrock.py +49 -0
  19. test/integration/embedders/test_huggingface.py +26 -0
  20. test/integration/embedders/test_mixedbread.py +47 -0
  21. test/integration/embedders/test_octoai.py +41 -0
  22. test/integration/embedders/test_openai.py +41 -0
  23. test/integration/embedders/test_vertexai.py +41 -0
  24. test/integration/embedders/test_voyageai.py +41 -0
  25. test/integration/embedders/togetherai.py +43 -0
  26. test/integration/embedders/utils.py +44 -0
  27. test/integration/partitioners/__init__.py +0 -0
  28. test/integration/partitioners/test_partitioner.py +75 -0
  29. test/integration/utils.py +15 -0
  30. test/unit/__init__.py +0 -0
  31. test/unit/embed/__init__.py +0 -0
  32. test/unit/embed/test_mixedbreadai.py +41 -0
  33. test/unit/embed/test_octoai.py +20 -0
  34. test/unit/embed/test_openai.py +20 -0
  35. test/unit/embed/test_vertexai.py +25 -0
  36. test/unit/embed/test_voyageai.py +24 -0
  37. test/unit/test_chunking_utils.py +36 -0
  38. test/unit/test_error.py +27 -0
  39. test/unit/test_interfaces.py +280 -0
  40. test/unit/test_interfaces_v2.py +26 -0
  41. test/unit/test_logger.py +78 -0
  42. test/unit/test_utils.py +164 -0
  43. test/unit/test_utils_v2.py +82 -0
  44. unstructured_ingest/__version__.py +1 -1
  45. unstructured_ingest/cli/interfaces.py +2 -2
  46. unstructured_ingest/connector/notion/types/block.py +1 -0
  47. unstructured_ingest/connector/notion/types/database.py +1 -0
  48. unstructured_ingest/connector/notion/types/page.py +1 -0
  49. unstructured_ingest/embed/bedrock.py +0 -20
  50. unstructured_ingest/embed/huggingface.py +0 -21
  51. unstructured_ingest/embed/interfaces.py +29 -3
  52. unstructured_ingest/embed/mixedbreadai.py +0 -36
  53. unstructured_ingest/embed/octoai.py +2 -24
  54. unstructured_ingest/embed/openai.py +0 -20
  55. unstructured_ingest/embed/togetherai.py +40 -0
  56. unstructured_ingest/embed/vertexai.py +0 -20
  57. unstructured_ingest/embed/voyageai.py +1 -24
  58. unstructured_ingest/interfaces.py +1 -1
  59. unstructured_ingest/v2/cli/utils/click.py +21 -2
  60. unstructured_ingest/v2/interfaces/connector.py +22 -2
  61. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  62. unstructured_ingest/v2/processes/chunker.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  64. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  65. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  71. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  72. unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
  73. unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
  74. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
  75. unstructured_ingest/v2/processes/embedder.py +13 -0
  76. unstructured_ingest/v2/processes/partitioner.py +2 -1
  77. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +14 -12
  78. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +82 -29
  79. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
  80. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
  83. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0

unstructured_ingest/embed/voyageai.py
@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-import numpy as np
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -13,7 +12,7 @@ if TYPE_CHECKING:
 
 class VoyageAIEmbeddingConfig(EmbeddingConfig):
     api_key: SecretStr
-    embedder_model_name: str = Field(alias="model_name")
+    embedder_model_name: str = Field(default="voyage-3", alias="model_name")
     batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
     max_retries: int = 0
@@ -39,19 +38,6 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
 class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VoyageAIEmbeddingConfig
 
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="A sample query.")
-
-    @property
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    @property
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client: VoyageAIClient = self.config.get_client()
         response = client.embed(texts=elements, model=self.config.embedder_model_name)
@@ -63,12 +49,3 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
 
     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(elements=[query])[0]
-
-    @staticmethod
-    def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
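
The net effect for VoyageAI is that the encoder now picks a model by default and drops its local numpy-based dimension/unit-vector helpers (the shared logic appears to move into unstructured_ingest/embed/interfaces.py, which gains 29 lines in this release). A minimal usage sketch under that assumption; the API key value is a placeholder:

from unstructured_ingest.embed.voyageai import (
    VoyageAIEmbeddingConfig,
    VoyageAIEmbeddingEncoder,
)

# model_name is omitted, so embedder_model_name falls back to the new "voyage-3" default.
config = VoyageAIEmbeddingConfig(api_key="vo-placeholder-key")
encoder = VoyageAIEmbeddingEncoder(config=config)
embedding = encoder.embed_query("A sample query.")  # list[float] via _embed_documents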

unstructured_ingest/interfaces.py
@@ -100,7 +100,7 @@ class PartitionConfig(BaseConfig):
     flatten_metadata: bool = False
     metadata_exclude: list[str] = field(default_factory=list)
     metadata_include: list[str] = field(default_factory=list)
-    partition_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
+    partition_endpoint: Optional[str] = "https://api.unstructuredapp.io/general/v0/general"
     partition_by_api: bool = False
     api_key: Optional[str] = str(enhanced_field(default=None, sensitive=True)) or None
     hi_res_model_name: Optional[str] = None

unstructured_ingest/v2/cli/utils/click.py
@@ -3,7 +3,7 @@ import os.path
 from gettext import gettext, ngettext
 from gettext import gettext as _
 from pathlib import Path
-from typing import Any, Optional, Type, TypeVar
+from typing import Any, Optional, Type, TypeVar, Union
 
 import click
 from pydantic import BaseModel, ConfigDict, Secret
@@ -112,6 +112,20 @@ class DelimitedString(click.ParamType):
 BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
 
 
+def unwrap_optional(val: Any) -> tuple[Any, bool]:
+    if (
+        hasattr(val, "__origin__")
+        and hasattr(val, "__args__")
+        and val.__origin__ is Union
+        and len(val.__args__) == 2
+        and type(None) in val.__args__
+    ):
+        args = val.__args__
+        args = [a for a in args if a is not None]
+        return args[0], True
+    return val, False
+
+
 def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
     fields = config.model_fields
     config.model_config = ConfigDict(extra="ignore")
@@ -119,6 +133,7 @@ def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
     data = {k: v for k, v in flat_data.items() if k in field_names and v is not None}
     if access_config := fields.get("access_config"):
         access_config_type = access_config.annotation
+        access_config_type, is_optional = unwrap_optional(access_config_type)
         # Check if raw type is wrapped by a secret
         if (
             hasattr(access_config_type, "__origin__")
@@ -132,9 +147,13 @@ def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
         else:
             raise TypeError(f"Unrecognized access_config type: {access_config_type}")
         ac_field_names = [v.alias or k for k, v in ac_fields.items()]
-        data["access_config"] = {
+        access_config_data = {
             k: v for k, v in flat_data.items() if k in ac_field_names and v is not None
         }
+        if not access_config_data and is_optional:
+            data["access_config"] = None
+        else:
+            data["access_config"] = access_config_data
     return config.model_validate(obj=data)
 
 
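Taken together, unwrap_optional and the extract_config change mean a connector whose access_config annotation is Optional now receives None when no access-config flags are supplied on the CLI, instead of an empty dict. A minimal sketch of that behavior; the Fake* models are illustrative and not part of the package:

from typing import Optional

from pydantic import BaseModel, Secret

from unstructured_ingest.v2.cli.utils.click import extract_config


class FakeAccessConfig(BaseModel):
    token: Optional[str] = None


class FakeConnectionConfig(BaseModel):
    host: str
    access_config: Optional[Secret[FakeAccessConfig]] = None


# "token" is absent from the flat CLI data, so the Optional[...] annotation is unwrapped,
# no access-config fields are found, and access_config is set to None rather than {}.
config = extract_config(flat_data={"host": "localhost"}, config=FakeConnectionConfig)
assert config.access_config is None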

unstructured_ingest/v2/interfaces/connector.py
@@ -1,8 +1,9 @@
 from abc import ABC
 from dataclasses import dataclass
-from typing import Any, TypeVar
+from typing import Any, TypeVar, Union
 
-from pydantic import BaseModel, Secret
+from pydantic import BaseModel, Secret, model_validator
+from pydantic.types import _SecretBase
 
 
 class AccessConfig(BaseModel):
@@ -21,6 +22,25 @@ class ConnectionConfig(BaseModel):
             return {}
         return self.access_config.get_secret_value().model_dump()
 
+    @model_validator(mode="after")
+    def check_access_config(self):
+        access_config = self.access_config
+        if self._is_access_config_optional() and access_config is None:
+            return self
+        if not isinstance(access_config, _SecretBase):
+            raise ValueError("access_config must be an instance of SecretBase")
+        return self
+
+    def _is_access_config_optional(self) -> bool:
+        access_config_type = self.model_fields["access_config"].annotation
+        return (
+            hasattr(access_config_type, "__origin__")
+            and hasattr(access_config_type, "__args__")
+            and access_config_type.__origin__ is Union
+            and len(access_config_type.__args__) == 2
+            and type(None) in access_config_type.__args__
+        )
+
 
 ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
 
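The new validator enforces at model level what the CLI change above allows: an access_config must arrive wrapped in a pydantic Secret unless the field is annotated Optional and left as None. A small sketch; MyAccessConfig and the two connection configs are illustrative subclasses, not part of the package:

from typing import Optional

from pydantic import Secret

from unstructured_ingest.v2.interfaces import AccessConfig, ConnectionConfig


class MyAccessConfig(AccessConfig):
    token: Optional[str] = None


class MyConnectionConfig(ConnectionConfig):
    access_config: Secret[MyAccessConfig]


class MyOptionalConnectionConfig(ConnectionConfig):
    access_config: Optional[Secret[MyAccessConfig]] = None


# A raw access config is coerced into a Secret during validation, so check_access_config passes.
ok = MyConnectionConfig(access_config=MyAccessConfig(token="abc"))

# With an Optional annotation, omitting access_config entirely is also accepted (it stays None).
also_ok = MyOptionalConnectionConfig()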

unstructured_ingest/v2/interfaces/downloader.py
@@ -62,6 +62,7 @@ class Downloader(BaseProcess, BaseConnector, ABC):
             date_modified = float(file_data.metadata.date_modified)
             date_created = float(file_data.metadata.date_created)
             os.utime(download_path, times=(date_created, date_modified))
+        file_data.local_download_path = str(download_path.resolve())
         return DownloadResponse(file_data=file_data, path=download_path)
 
     @property

unstructured_ingest/v2/processes/chunker.py
@@ -20,7 +20,7 @@ class ChunkerConfig(BaseModel):
         default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
     )
     chunking_endpoint: Optional[str] = Field(
-        default="https://api.unstructured.io/general/v0/general",
+        default="https://api.unstructuredapp.io/general/v0/general",
         description="If chunking via api, use the following host.",
     )
     chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")

unstructured_ingest/v2/processes/connectors/__init__.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
     add_source_entry,
@@ -16,11 +18,6 @@ from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
-from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
-from .databricks_volumes import (
-    databricks_volumes_destination_entry,
-    databricks_volumes_source_entry,
-)
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -32,7 +29,7 @@ from .local import local_destination_entry, local_source_entry
 from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
 from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
-from .mongodb import mongodb_destination_entry
+from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
@@ -47,8 +44,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
 from .singlestore import singlestore_destination_entry
-from .sql import CONNECTOR_TYPE as SQL_CONNECTOR_TYPE
-from .sql import sql_destination_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
@@ -80,17 +75,9 @@ add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_
 
 add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
 
-add_destination_entry(
-    destination_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_destination_entry
-)
-add_source_entry(
-    source_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_source_entry
-)
-
-
-add_destination_entry(destination_type=SQL_CONNECTOR_TYPE, entry=sql_destination_entry)
-
 add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
+add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
+
 add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry)
 add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
 add_destination_entry(
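
Because registration happens at import time, nothing beyond importing the connectors package is needed to expose the new databricks and sql entries; a minimal sketch:

# The two noqa'd imports added above run on package import and call
# add_source_entry / add_destination_entry for the databricks volume and SQL connectors.
import unstructured_ingest.v2.processes.connectors  # noqa: F401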

unstructured_ingest/v2/processes/connectors/databricks/__init__.py (new file)
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .volumes_aws import CONNECTOR_TYPE as VOLUMES_AWS_CONNECTOR_TYPE
+from .volumes_aws import (
+    databricks_aws_volumes_destination_entry,
+    databricks_aws_volumes_source_entry,
+)
+from .volumes_azure import CONNECTOR_TYPE as VOLUMES_AZURE_CONNECTOR_TYPE
+from .volumes_azure import (
+    databricks_azure_volumes_destination_entry,
+    databricks_azure_volumes_source_entry,
+)
+from .volumes_gcp import CONNECTOR_TYPE as VOLUMES_GCP_CONNECTOR_TYPE
+from .volumes_gcp import (
+    databricks_gcp_volumes_destination_entry,
+    databricks_gcp_volumes_source_entry,
+)
+from .volumes_native import CONNECTOR_TYPE as VOLUMES_NATIVE_CONNECTOR_TYPE
+from .volumes_native import (
+    databricks_native_volumes_destination_entry,
+    databricks_native_volumes_source_entry,
+)
+
+add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_destination_entry
+)
+
+add_source_entry(source_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_destination_entry
+)
+
+add_source_entry(
+    source_type=VOLUMES_NATIVE_CONNECTOR_TYPE, entry=databricks_native_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_NATIVE_CONNECTOR_TYPE,
+    entry=databricks_native_volumes_destination_entry,
+)
+
+add_source_entry(
+    source_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
+)

unstructured_ingest/v2/processes/connectors/databricks/volumes.py (new file)
@@ -0,0 +1,175 @@
+import os
+from abc import ABC
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field
+
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from databricks.sdk import WorkspaceClient
+
+
+class DatabricksPathMixin(BaseModel):
+    volume: str = Field(description="Name of volume in the Unity Catalog")
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    volume_path: Optional[str] = Field(
+        default=None, description="Optional path within the volume to write to"
+    )
+    databricks_schema: str = Field(
+        default="default",
+        alias="schema",
+        description="Schema associated with the volume to write to in the Unity Catalog service",
+    )
+
+    @property
+    def path(self) -> str:
+        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
+        if self.volume_path:
+            path = f"{path}/{self.volume_path}"
+        return path
+
+
+class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+    host: Optional[str] = Field(
+        default=None,
+        description="The Databricks host URL for either the "
+        "Databricks workspace endpoint or the "
+        "Databricks accounts endpoint.",
+    )
+
+    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
+    def get_client(self) -> "WorkspaceClient":
+        from databricks.sdk import WorkspaceClient
+
+        return WorkspaceClient(
+            host=self.host,
+            **self.access_config.get_secret_value().model_dump(),
+        )
+
+
+class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
+    recursive: bool = False
+
+
+@dataclass
+class DatabricksVolumesIndexer(Indexer, ABC):
+    index_config: DatabricksVolumesIndexerConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        for file_info in self.connection_config.get_client().dbfs.list(
+            path=self.index_config.path, recursive=self.index_config.recursive
+        ):
+            if file_info.is_dir:
+                continue
+            rel_path = file_info.path.replace(self.index_config.path, "")
+            if rel_path.startswith("/"):
+                rel_path = rel_path[1:]
+            filename = Path(file_info.path).name
+            yield FileData(
+                identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
+                connector_type=self.connector_type,
+                source_identifiers=SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                ),
+                additional_metadata={"catalog": self.index_config.catalog, "path": file_info.path},
+                metadata=FileDataSourceMetadata(
+                    url=file_info.path, date_modified=str(file_info.modification_time)
+                ),
+            )
+
+
+class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksVolumesDownloader(Downloader, ABC):
+    download_config: DatabricksVolumesDownloaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def get_download_path(self, file_data: FileData) -> Path:
+        return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        volumes_path = file_data.additional_metadata["path"]
+        logger.info(f"Writing {file_data.identifier} to {download_path}")
+        try:
+            with self.connection_config.get_client().dbfs.download(path=volumes_path) as c:
+                read_content = c._read_handle.read()
+            with open(download_path, "wb") as f:
+                f.write(read_content)
+        except Exception as e:
+            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
+    overwrite: bool = Field(
+        default=False, description="If true, an existing file will be overwritten."
+    )
+
+
+@dataclass
+class DatabricksVolumesUploader(Uploader, ABC):
+    upload_config: DatabricksVolumesUploaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            assert self.connection_config.get_client().current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        output_path = os.path.join(self.upload_config.path, path.name)
+        with open(path, "rb") as elements_file:
+            self.connection_config.get_client().files.upload(
+                file_path=output_path,
+                contents=elements_file,
+                overwrite=self.upload_config.overwrite,
+            )
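
All three config classes share DatabricksPathMixin, which is what resolves catalog, schema, and volume into the /Volumes/... path used by dbfs.list and files.upload. A small illustration of the path property via the uploader config defined above (values are placeholders; note that databricks_schema is populated through its "schema" alias, and this assumes UploaderConfig adds no other required fields):

from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
    DatabricksVolumesUploaderConfig,
)

config = DatabricksVolumesUploaderConfig(
    catalog="main",
    schema="ingest",  # stored on the model as databricks_schema via the alias
    volume="landing",
    volume_path="output/json",
)
print(config.path)  # /Volumes/main/ingest/landing/output/json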

unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py (new file)
@@ -0,0 +1,87 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_aws"
+
+
+class DatabricksAWSVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint",
+    )
+    profile: Optional[str] = None
+    token: Optional[str] = Field(
+        default=None,
+        description="The Databricks personal access token (PAT)",
+    )
+
+
+class DatabricksAWSVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksAWSVolumesAccessConfig]
+
+
+class DatabricksAWSVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    index_config: DatabricksAWSVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAWSVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAWSVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    upload_config: DatabricksAWSVolumesUploaderConfig = field(
+        default_factory=DatabricksAWSVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksAWSVolumesConnectionConfig,
+    uploader=DatabricksAWSVolumesUploader,
+    uploader_config=DatabricksAWSVolumesUploaderConfig,
+)
+
+databricks_aws_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksAWSVolumesConnectionConfig,
+    indexer=DatabricksAWSVolumesIndexer,
+    indexer_config=DatabricksAWSVolumesIndexerConfig,
+    downloader=DatabricksAWSVolumesDownloader,
+    downloader_config=DatabricksAWSVolumesDownloaderConfig,
+)
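
Each per-cloud module is a thin binding of the generic pieces above: a cloud-specific AccessConfig, a ConnectionConfig that wraps it in a Secret, and source/destination registry entries. A hedged sketch of configuring the AWS variant with a personal access token (host and token are placeholders):

from unstructured_ingest.v2.processes.connectors.databricks.volumes_aws import (
    DatabricksAWSVolumesAccessConfig,
    DatabricksAWSVolumesConnectionConfig,
)

connection_config = DatabricksAWSVolumesConnectionConfig(
    host="https://dbc-example.cloud.databricks.com",
    access_config=DatabricksAWSVolumesAccessConfig(token="dapi-placeholder"),
)
# get_client() builds a databricks.sdk WorkspaceClient from host plus the unwrapped
# access-config fields; it requires the databricks-volumes extra to be installed.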

unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py (new file)
@@ -0,0 +1,102 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_azure"
+
+
+class DatabricksAzureVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint.",
+    )
+    profile: Optional[str] = None
+    azure_workspace_resource_id: Optional[str] = Field(
+        default=None,
+        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+        "which is exchanged for a Databricks host URL.",
+    )
+    azure_client_secret: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s client secret."
+    )
+    azure_client_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s application ID."
+    )
+    azure_tenant_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s tenant ID."
+    )
+    azure_environment: Optional[str] = Field(
+        default=None,
+        description="The Azure environment type for a " "specific set of API endpoints",
+        examples=["Public", "UsGov", "China", "Germany"],
+    )
+
+
+class DatabricksAzureVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksAzureVolumesAccessConfig]
+
+
+class DatabricksAzureVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    index_config: DatabricksAzureVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAzureVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAzureVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    upload_config: DatabricksAzureVolumesUploaderConfig = field(
+        default_factory=DatabricksAzureVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksAzureVolumesConnectionConfig,
+    uploader=DatabricksAzureVolumesUploader,
+    uploader_config=DatabricksAzureVolumesUploaderConfig,
+)
+
+databricks_azure_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksAzureVolumesConnectionConfig,
+    indexer=DatabricksAzureVolumesIndexer,
+    indexer_config=DatabricksAzureVolumesIndexerConfig,
+    downloader=DatabricksAzureVolumesDownloader,
+    downloader_config=DatabricksAzureVolumesDownloaderConfig,
+)