unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (86)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/sql/__init__.py +0 -0
  10. test/integration/connectors/sql/test_postgres.py +178 -0
  11. test/integration/connectors/sql/test_sqlite.py +151 -0
  12. test/integration/connectors/test_s3.py +152 -0
  13. test/integration/connectors/utils/__init__.py +0 -0
  14. test/integration/connectors/utils/constants.py +7 -0
  15. test/integration/connectors/utils/docker_compose.py +44 -0
  16. test/integration/connectors/utils/validation.py +203 -0
  17. test/integration/embedders/__init__.py +0 -0
  18. test/integration/embedders/conftest.py +13 -0
  19. test/integration/embedders/test_bedrock.py +49 -0
  20. test/integration/embedders/test_huggingface.py +26 -0
  21. test/integration/embedders/test_mixedbread.py +47 -0
  22. test/integration/embedders/test_octoai.py +41 -0
  23. test/integration/embedders/test_openai.py +41 -0
  24. test/integration/embedders/test_vertexai.py +41 -0
  25. test/integration/embedders/test_voyageai.py +41 -0
  26. test/integration/embedders/togetherai.py +43 -0
  27. test/integration/embedders/utils.py +44 -0
  28. test/integration/partitioners/__init__.py +0 -0
  29. test/integration/partitioners/test_partitioner.py +75 -0
  30. test/integration/utils.py +15 -0
  31. test/unit/__init__.py +0 -0
  32. test/unit/embed/__init__.py +0 -0
  33. test/unit/embed/test_mixedbreadai.py +41 -0
  34. test/unit/embed/test_octoai.py +20 -0
  35. test/unit/embed/test_openai.py +20 -0
  36. test/unit/embed/test_vertexai.py +25 -0
  37. test/unit/embed/test_voyageai.py +24 -0
  38. test/unit/test_chunking_utils.py +36 -0
  39. test/unit/test_error.py +27 -0
  40. test/unit/test_interfaces.py +280 -0
  41. test/unit/test_interfaces_v2.py +26 -0
  42. test/unit/test_logger.py +78 -0
  43. test/unit/test_utils.py +164 -0
  44. test/unit/test_utils_v2.py +82 -0
  45. unstructured_ingest/__version__.py +1 -1
  46. unstructured_ingest/cli/interfaces.py +2 -2
  47. unstructured_ingest/connector/notion/types/block.py +1 -0
  48. unstructured_ingest/connector/notion/types/database.py +1 -0
  49. unstructured_ingest/connector/notion/types/page.py +1 -0
  50. unstructured_ingest/embed/bedrock.py +0 -20
  51. unstructured_ingest/embed/huggingface.py +0 -21
  52. unstructured_ingest/embed/interfaces.py +29 -3
  53. unstructured_ingest/embed/mixedbreadai.py +0 -36
  54. unstructured_ingest/embed/octoai.py +2 -24
  55. unstructured_ingest/embed/openai.py +0 -20
  56. unstructured_ingest/embed/togetherai.py +40 -0
  57. unstructured_ingest/embed/vertexai.py +0 -20
  58. unstructured_ingest/embed/voyageai.py +1 -24
  59. unstructured_ingest/interfaces.py +1 -1
  60. unstructured_ingest/v2/cli/utils/click.py +21 -2
  61. unstructured_ingest/v2/interfaces/connector.py +22 -2
  62. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  63. unstructured_ingest/v2/processes/chunker.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  65. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  70. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  71. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
  72. unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  74. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  75. unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
  76. unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
  77. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
  78. unstructured_ingest/v2/processes/embedder.py +13 -0
  79. unstructured_ingest/v2/processes/partitioner.py +2 -1
  80. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
  83. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  84. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -0,0 +1,175 @@
+ import os
+ from abc import ABC
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Generator, Optional
+ from uuid import NAMESPACE_DNS, uuid5
+
+ from pydantic import BaseModel, Field
+
+ from unstructured_ingest.error import (
+     DestinationConnectionError,
+     SourceConnectionError,
+     SourceConnectionNetworkError,
+ )
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.interfaces import (
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     DownloadResponse,
+     FileData,
+     FileDataSourceMetadata,
+     Indexer,
+     IndexerConfig,
+     SourceIdentifiers,
+     Uploader,
+     UploaderConfig,
+ )
+ from unstructured_ingest.v2.logger import logger
+
+ if TYPE_CHECKING:
+     from databricks.sdk import WorkspaceClient
+
+
+ class DatabricksPathMixin(BaseModel):
+     volume: str = Field(description="Name of volume in the Unity Catalog")
+     catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+     volume_path: Optional[str] = Field(
+         default=None, description="Optional path within the volume to write to"
+     )
+     databricks_schema: str = Field(
+         default="default",
+         alias="schema",
+         description="Schema associated with the volume to write to in the Unity Catalog service",
+     )
+
+     @property
+     def path(self) -> str:
+         path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
+         if self.volume_path:
+             path = f"{path}/{self.volume_path}"
+         return path
+
+
+ class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+     host: Optional[str] = Field(
+         default=None,
+         description="The Databricks host URL for either the "
+         "Databricks workspace endpoint or the "
+         "Databricks accounts endpoint.",
+     )
+
+     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
+     def get_client(self) -> "WorkspaceClient":
+         from databricks.sdk import WorkspaceClient
+
+         return WorkspaceClient(
+             host=self.host,
+             **self.access_config.get_secret_value().model_dump(),
+         )
+
+
+ class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
+     recursive: bool = False
+
+
+ @dataclass
+ class DatabricksVolumesIndexer(Indexer, ABC):
+     index_config: DatabricksVolumesIndexerConfig
+     connection_config: DatabricksVolumesConnectionConfig
+
+     def precheck(self) -> None:
+         try:
+             self.connection_config.get_client()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
+     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+         for file_info in self.connection_config.get_client().dbfs.list(
+             path=self.index_config.path, recursive=self.index_config.recursive
+         ):
+             if file_info.is_dir:
+                 continue
+             rel_path = file_info.path.replace(self.index_config.path, "")
+             if rel_path.startswith("/"):
+                 rel_path = rel_path[1:]
+             filename = Path(file_info.path).name
+             yield FileData(
+                 identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
+                 connector_type=self.connector_type,
+                 source_identifiers=SourceIdentifiers(
+                     filename=filename,
+                     rel_path=rel_path,
+                     fullpath=file_info.path,
+                 ),
+                 additional_metadata={"catalog": self.index_config.catalog, "path": file_info.path},
+                 metadata=FileDataSourceMetadata(
+                     url=file_info.path, date_modified=str(file_info.modification_time)
+                 ),
+             )
+
+
+ class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksVolumesDownloader(Downloader, ABC):
+     download_config: DatabricksVolumesDownloaderConfig
+     connection_config: DatabricksVolumesConnectionConfig
+
+     def precheck(self) -> None:
+         try:
+             self.connection_config.get_client()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
+     def get_download_path(self, file_data: FileData) -> Path:
+         return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+         download_path = self.get_download_path(file_data=file_data)
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+         volumes_path = file_data.additional_metadata["path"]
+         logger.info(f"Writing {file_data.identifier} to {download_path}")
+         try:
+             with self.connection_config.get_client().dbfs.download(path=volumes_path) as c:
+                 read_content = c._read_handle.read()
+             with open(download_path, "wb") as f:
+                 f.write(read_content)
+         except Exception as e:
+             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
+             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+         return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+ class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
+     overwrite: bool = Field(
+         default=False, description="If true, an existing file will be overwritten."
+     )
+
+
+ @dataclass
+ class DatabricksVolumesUploader(Uploader, ABC):
+     upload_config: DatabricksVolumesUploaderConfig
+     connection_config: DatabricksVolumesConnectionConfig
+
+     def precheck(self) -> None:
+         try:
+             assert self.connection_config.get_client().current_user.me().active
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         output_path = os.path.join(self.upload_config.path, path.name)
+         with open(path, "rb") as elements_file:
+             self.connection_config.get_client().files.upload(
+                 file_path=output_path,
+                 contents=elements_file,
+                 overwrite=self.upload_config.overwrite,
+             )
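
For orientation, here is a minimal, self-contained sketch (not part of the diff) that re-creates the DatabricksPathMixin.path logic shown above, so the resulting Unity Catalog volume path is easy to see. The class name PathMixinSketch and the example values are illustrative only.

from typing import Optional

from pydantic import BaseModel, Field


# Illustrative re-creation of the path-building behavior in DatabricksPathMixin;
# only the fields needed for the path are reproduced here.
class PathMixinSketch(BaseModel):
    volume: str
    catalog: str
    volume_path: Optional[str] = None
    # The real field uses alias="schema", so callers pass schema="..." on construction.
    databricks_schema: str = Field(default="default", alias="schema")

    @property
    def path(self) -> str:
        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
        if self.volume_path:
            path = f"{path}/{self.volume_path}"
        return path


# Example values are placeholders.
print(PathMixinSketch(catalog="main", volume="raw_docs", volume_path="pdfs", schema="ingest").path)
# -> /Volumes/main/ingest/raw_docs/pdfs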
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py
@@ -0,0 +1,87 @@
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.interfaces import AccessConfig
+ from unstructured_ingest.v2.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+     DatabricksVolumesConnectionConfig,
+     DatabricksVolumesDownloader,
+     DatabricksVolumesDownloaderConfig,
+     DatabricksVolumesIndexer,
+     DatabricksVolumesIndexerConfig,
+     DatabricksVolumesUploader,
+     DatabricksVolumesUploaderConfig,
+ )
+
+ CONNECTOR_TYPE = "databricks_volumes_aws"
+
+
+ class DatabricksAWSVolumesAccessConfig(AccessConfig):
+     account_id: Optional[str] = Field(
+         default=None,
+         description="The Databricks account ID for the Databricks " "accounts endpoint",
+     )
+     profile: Optional[str] = None
+     token: Optional[str] = Field(
+         default=None,
+         description="The Databricks personal access token (PAT)",
+     )
+
+
+ class DatabricksAWSVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+     access_config: Secret[DatabricksAWSVolumesAccessConfig]
+
+
+ class DatabricksAWSVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksAWSVolumesIndexer(DatabricksVolumesIndexer):
+     connection_config: DatabricksAWSVolumesConnectionConfig
+     index_config: DatabricksAWSVolumesIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksAWSVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksAWSVolumesDownloader(DatabricksVolumesDownloader):
+     connection_config: DatabricksAWSVolumesConnectionConfig
+     download_config: DatabricksVolumesDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksAWSVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksAWSVolumesUploader(DatabricksVolumesUploader):
+     connection_config: DatabricksAWSVolumesConnectionConfig
+     upload_config: DatabricksAWSVolumesUploaderConfig = field(
+         default_factory=DatabricksAWSVolumesUploaderConfig
+     )
+     connector_type: str = CONNECTOR_TYPE
+
+
+ databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
+     connection_config=DatabricksAWSVolumesConnectionConfig,
+     uploader=DatabricksAWSVolumesUploader,
+     uploader_config=DatabricksAWSVolumesUploaderConfig,
+ )
+
+ databricks_aws_volumes_source_entry = SourceRegistryEntry(
+     connection_config=DatabricksAWSVolumesConnectionConfig,
+     indexer=DatabricksAWSVolumesIndexer,
+     indexer_config=DatabricksAWSVolumesIndexerConfig,
+     downloader=DatabricksAWSVolumesDownloader,
+     downloader_config=DatabricksAWSVolumesDownloaderConfig,
+ )
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py
@@ -0,0 +1,102 @@
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.interfaces import AccessConfig
+ from unstructured_ingest.v2.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+     DatabricksVolumesConnectionConfig,
+     DatabricksVolumesDownloader,
+     DatabricksVolumesDownloaderConfig,
+     DatabricksVolumesIndexer,
+     DatabricksVolumesIndexerConfig,
+     DatabricksVolumesUploader,
+     DatabricksVolumesUploaderConfig,
+ )
+
+ CONNECTOR_TYPE = "databricks_volumes_azure"
+
+
+ class DatabricksAzureVolumesAccessConfig(AccessConfig):
+     account_id: Optional[str] = Field(
+         default=None,
+         description="The Databricks account ID for the Databricks " "accounts endpoint.",
+     )
+     profile: Optional[str] = None
+     azure_workspace_resource_id: Optional[str] = Field(
+         default=None,
+         description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+         "which is exchanged for a Databricks host URL.",
+     )
+     azure_client_secret: Optional[str] = Field(
+         default=None, description="The Azure AD service principal’s client secret."
+     )
+     azure_client_id: Optional[str] = Field(
+         default=None, description="The Azure AD service principal’s application ID."
+     )
+     azure_tenant_id: Optional[str] = Field(
+         default=None, description="The Azure AD service principal’s tenant ID."
+     )
+     azure_environment: Optional[str] = Field(
+         default=None,
+         description="The Azure environment type for a " "specific set of API endpoints",
+         examples=["Public", "UsGov", "China", "Germany"],
+     )
+
+
+ class DatabricksAzureVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+     access_config: Secret[DatabricksAzureVolumesAccessConfig]
+
+
+ class DatabricksAzureVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksAzureVolumesIndexer(DatabricksVolumesIndexer):
+     connection_config: DatabricksAzureVolumesConnectionConfig
+     index_config: DatabricksAzureVolumesIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksAzureVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksAzureVolumesDownloader(DatabricksVolumesDownloader):
+     connection_config: DatabricksAzureVolumesConnectionConfig
+     download_config: DatabricksVolumesDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksAzureVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksAzureVolumesUploader(DatabricksVolumesUploader):
+     connection_config: DatabricksAzureVolumesConnectionConfig
+     upload_config: DatabricksAzureVolumesUploaderConfig = field(
+         default_factory=DatabricksAzureVolumesUploaderConfig
+     )
+     connector_type: str = CONNECTOR_TYPE
+
+
+ databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
+     connection_config=DatabricksAzureVolumesConnectionConfig,
+     uploader=DatabricksAzureVolumesUploader,
+     uploader_config=DatabricksAzureVolumesUploaderConfig,
+ )
+
+ databricks_azure_volumes_source_entry = SourceRegistryEntry(
+     connection_config=DatabricksAzureVolumesConnectionConfig,
+     indexer=DatabricksAzureVolumesIndexer,
+     indexer_config=DatabricksAzureVolumesIndexerConfig,
+     downloader=DatabricksAzureVolumesDownloader,
+     downloader_config=DatabricksAzureVolumesDownloaderConfig,
+ )
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py
@@ -0,0 +1,85 @@
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.interfaces import AccessConfig
+ from unstructured_ingest.v2.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+     DatabricksVolumesConnectionConfig,
+     DatabricksVolumesDownloader,
+     DatabricksVolumesDownloaderConfig,
+     DatabricksVolumesIndexer,
+     DatabricksVolumesIndexerConfig,
+     DatabricksVolumesUploader,
+     DatabricksVolumesUploaderConfig,
+ )
+
+ CONNECTOR_TYPE = "databricks_volumes_gcp"
+
+
+ class DatabricksGoogleVolumesAccessConfig(AccessConfig):
+     account_id: Optional[str] = Field(
+         default=None,
+         description="The Databricks account ID for the Databricks " "accounts endpoint.",
+     )
+     profile: Optional[str] = None
+     google_credentials: Optional[str] = None
+     google_service_account: Optional[str] = None
+
+
+ class DatabricksGoogleVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+     access_config: Secret[DatabricksGoogleVolumesAccessConfig]
+
+
+ class DatabricksGoogleVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksGoogleVolumesIndexer(DatabricksVolumesIndexer):
+     connection_config: DatabricksGoogleVolumesConnectionConfig
+     index_config: DatabricksGoogleVolumesIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksGoogleVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksGoogleVolumesDownloader(DatabricksVolumesDownloader):
+     connection_config: DatabricksGoogleVolumesConnectionConfig
+     download_config: DatabricksVolumesDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksGoogleVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksGoogleVolumesUploader(DatabricksVolumesUploader):
+     connection_config: DatabricksGoogleVolumesConnectionConfig
+     upload_config: DatabricksGoogleVolumesUploaderConfig = field(
+         default_factory=DatabricksGoogleVolumesUploaderConfig
+     )
+     connector_type: str = CONNECTOR_TYPE
+
+
+ databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
+     connection_config=DatabricksGoogleVolumesConnectionConfig,
+     uploader=DatabricksGoogleVolumesUploader,
+     uploader_config=DatabricksGoogleVolumesUploaderConfig,
+ )
+
+ databricks_gcp_volumes_source_entry = SourceRegistryEntry(
+     connection_config=DatabricksGoogleVolumesConnectionConfig,
+     indexer=DatabricksGoogleVolumesIndexer,
+     indexer_config=DatabricksGoogleVolumesIndexerConfig,
+     downloader=DatabricksGoogleVolumesDownloader,
+     downloader_config=DatabricksGoogleVolumesDownloaderConfig,
+ )
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py
@@ -0,0 +1,86 @@
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.interfaces import AccessConfig
+ from unstructured_ingest.v2.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+     DatabricksVolumesConnectionConfig,
+     DatabricksVolumesDownloader,
+     DatabricksVolumesDownloaderConfig,
+     DatabricksVolumesIndexer,
+     DatabricksVolumesIndexerConfig,
+     DatabricksVolumesUploader,
+     DatabricksVolumesUploaderConfig,
+ )
+
+ CONNECTOR_TYPE = "databricks_volumes"
+
+
+ class DatabricksNativeVolumesAccessConfig(AccessConfig):
+     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+     client_secret: Optional[str] = Field(
+         default=None, description="Client Secret of the OAuth app."
+     )
+     profile: Optional[str] = None
+     azure_workspace_resource_id: Optional[str] = Field(
+         default=None,
+         description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+         "which is exchanged for a Databricks host URL.",
+     )
+
+
+ class DatabricksNativeVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+     access_config: Secret[DatabricksNativeVolumesAccessConfig]
+
+
+ class DatabricksNativeVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksNativeVolumesIndexer(DatabricksVolumesIndexer):
+     connection_config: DatabricksNativeVolumesConnectionConfig
+     index_config: DatabricksNativeVolumesIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksNativeVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksNativeVolumesDownloader(DatabricksVolumesDownloader):
+     connection_config: DatabricksNativeVolumesConnectionConfig
+     download_config: DatabricksVolumesDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksNativeVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksNativeVolumesUploader(DatabricksVolumesUploader):
+     connection_config: DatabricksNativeVolumesConnectionConfig
+     upload_config: DatabricksNativeVolumesUploaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ databricks_native_volumes_destination_entry = DestinationRegistryEntry(
+     connection_config=DatabricksNativeVolumesConnectionConfig,
+     uploader=DatabricksNativeVolumesUploader,
+     uploader_config=DatabricksNativeVolumesUploaderConfig,
+ )
+
+ databricks_native_volumes_source_entry = SourceRegistryEntry(
+     connection_config=DatabricksNativeVolumesConnectionConfig,
+     indexer=DatabricksNativeVolumesIndexer,
+     indexer_config=DatabricksNativeVolumesIndexerConfig,
+     downloader=DatabricksNativeVolumesDownloader,
+     downloader_config=DatabricksNativeVolumesDownloaderConfig,
+ )
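
As a rough usage sketch (not from the diff), the native-auth pieces above could be wired together as follows. The host URL, OAuth credentials, and Unity Catalog names are placeholders, and this assumes the pydantic Secret[...] field accepts the raw access-config instance at validation time.

from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
    DatabricksNativeVolumesAccessConfig,
    DatabricksNativeVolumesConnectionConfig,
    DatabricksNativeVolumesIndexer,
    DatabricksNativeVolumesIndexerConfig,
)

# Placeholder credentials and workspace URL.
connection_config = DatabricksNativeVolumesConnectionConfig(
    host="https://example.cloud.databricks.com",
    access_config=DatabricksNativeVolumesAccessConfig(
        client_id="my-oauth-client-id",
        client_secret="my-oauth-client-secret",
    ),
)

# Placeholder Unity Catalog names; databricks_schema falls back to "default".
index_config = DatabricksNativeVolumesIndexerConfig(
    catalog="main", volume="raw_docs", volume_path="pdfs"
)

indexer = DatabricksNativeVolumesIndexer(
    connection_config=connection_config, index_config=index_config
)

# indexer.precheck() validates the connection; indexer.run() then yields one
# FileData entry per (non-directory) file found under the volume path.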
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ import random
  from dataclasses import dataclass, field
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -63,6 +64,7 @@ class FileConfig(BaseModel):

  class FsspecIndexerConfig(FileConfig, IndexerConfig):
      recursive: bool = False
+     sample_n_files: Optional[int] = None


  class FsspecAccessConfig(AccessConfig):
@@ -128,8 +130,23 @@ class FsspecIndexer(Indexer):
          filtered_files = [
              file for file in files if file.get("size") > 0 and file.get("type") == "file"
          ]
+
+         if self.index_config.sample_n_files:
+             filtered_files = self.sample_n_files(filtered_files, self.index_config.sample_n_files)
+
          return filtered_files

+     def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
+         if len(files) <= n:
+             logger.warning(
+                 f"number of files to be sampled={n} is not smaller than the number"
+                 f" of files found ({len(files)}). Returning all of the files as the"
+                 " sample."
+             )
+             return files
+
+         return random.sample(files, n)
+
      def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
          raise NotImplementedError()

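
The new sample_n_files option caps how many indexed files move through the pipeline. Below is a standalone sketch of that sampling behavior (simplified names, not the connector code itself).

import logging
import random
from typing import Any

logger = logging.getLogger(__name__)


def sample_n_files(files: list[dict[str, Any]], n: int) -> list[dict[str, Any]]:
    # Mirrors the guard in FsspecIndexer.sample_n_files: if the requested sample
    # size is not smaller than the population, log a warning and return everything.
    if len(files) <= n:
        logger.warning(
            "number of files to be sampled=%s is not smaller than the number "
            "of files found (%s). Returning all of the files as the sample.",
            n,
            len(files),
        )
        return files
    return random.sample(files, n)


files = [{"name": f"doc_{i}.pdf", "size": 1, "type": "file"} for i in range(10)]
print(len(sample_n_files(files, 3)))  # -> 3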
unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -26,7 +26,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
  )

  if TYPE_CHECKING:
-     from kdbai_client import Session, Table
+     from kdbai_client import Database, Session, Table

  CONNECTOR_TYPE = "kdbai"

@@ -99,6 +99,9 @@ class KdbaiUploadStager(UploadStager):


  class KdbaiUploaderConfig(UploaderConfig):
+     database_name: str = Field(
+         default="default", description="The name of the KDBAI database to write into."
+     )
      table_name: str = Field(description="The name of the KDBAI table to write into.")
      batch_size: int = Field(default=100, description="Number of records per batch")

@@ -111,24 +114,29 @@ class KdbaiUploader(Uploader):

      def precheck(self) -> None:
          try:
-             self.get_table()
+             self.get_database()
          except Exception as e:
              logger.error(f"Failed to validate connection {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

-     def get_table(self) -> "Table":
+     def get_database(self) -> "Database":
          session: Session = self.connection_config.get_session()
-         table = session.table(self.upload_config.table_name)
+         db = session.database(self.upload_config.database_name)
+         return db
+
+     def get_table(self) -> "Table":
+         db = self.get_database()
+         table = db.table(self.upload_config.table_name)
          return table

      def upsert_batch(self, batch: pd.DataFrame):
          table = self.get_table()
-         table.insert(data=batch)
+         table.insert(batch)

      def process_dataframe(self, df: pd.DataFrame):
          logger.debug(
              f"uploading {len(df)} entries to {self.connection_config.endpoint} "
-             f"db in table {self.upload_config.table_name}"
+             f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
          )
          for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
              self.upsert_batch(batch=batch_df)
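
The KDB.AI change routes writes through an explicit database handle (session -> database -> table) instead of addressing the table directly on the session. A hedged sketch of that flow with the kdbai_client library follows, assuming an endpoint/API-key session; the endpoint, key, and names are placeholders.

import kdbai_client as kdbai
import pandas as pd

# Placeholder endpoint and API key.
session = kdbai.Session(endpoint="https://cloud.kdb.ai/instance/XXXX", api_key="my-api-key")

# New flow mirrored from KdbaiUploader: resolve the database first, then the table.
db = session.database("default")   # upload_config.database_name
table = db.table("elements")       # upload_config.table_name

# Batched insert, as in process_dataframe/upsert_batch (batch_size defaults to 100).
df = pd.DataFrame([{"id": "abc", "text": "hello world"}])
table.insert(df)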