unstructured-ingest 1.2.32 (py3-none-any.whl)
This diff shows the contents of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/processes/connectors/databricks/volumes_aws.py
@@ -0,0 +1,93 @@
from dataclasses import dataclass, field
from typing import Optional

from pydantic import Field, Secret

from unstructured_ingest.processes.connector_registry import (
    DestinationRegistryEntry,
    SourceRegistryEntry,
)
from unstructured_ingest.processes.connectors.databricks.volumes import (
    DatabricksVolumesAccessConfig,
    DatabricksVolumesConnectionConfig,
    DatabricksVolumesDownloader,
    DatabricksVolumesDownloaderConfig,
    DatabricksVolumesIndexer,
    DatabricksVolumesIndexerConfig,
    DatabricksVolumesUploader,
    DatabricksVolumesUploaderConfig,
)
from unstructured_ingest.processes.utils.blob_storage import (
    BlobStoreUploadStager,
    BlobStoreUploadStagerConfig,
)

CONNECTOR_TYPE = "databricks_volumes_aws"


class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None,
        description="The Databricks account ID for the Databricks accounts endpoint",
    )
    profile: Optional[str] = None
    token: Optional[str] = Field(
        default=None,
        description="The Databricks personal access token (PAT)",
    )


class DatabricksAWSVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    access_config: Secret[DatabricksAWSVolumesAccessConfig]


class DatabricksAWSVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass


@dataclass
class DatabricksAWSVolumesIndexer(DatabricksVolumesIndexer):
    connection_config: DatabricksAWSVolumesConnectionConfig
    index_config: DatabricksAWSVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE


class DatabricksAWSVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass


@dataclass
class DatabricksAWSVolumesDownloader(DatabricksVolumesDownloader):
    connection_config: DatabricksAWSVolumesConnectionConfig
    download_config: DatabricksVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE


class DatabricksAWSVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass


@dataclass
class DatabricksAWSVolumesUploader(DatabricksVolumesUploader):
    connection_config: DatabricksAWSVolumesConnectionConfig
    upload_config: DatabricksAWSVolumesUploaderConfig = field(
        default_factory=DatabricksAWSVolumesUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE


databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
    connection_config=DatabricksAWSVolumesConnectionConfig,
    uploader=DatabricksAWSVolumesUploader,
    uploader_config=DatabricksAWSVolumesUploaderConfig,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)

databricks_aws_volumes_source_entry = SourceRegistryEntry(
    connection_config=DatabricksAWSVolumesConnectionConfig,
    indexer=DatabricksAWSVolumesIndexer,
    indexer_config=DatabricksAWSVolumesIndexerConfig,
    downloader=DatabricksAWSVolumesDownloader,
    downloader_config=DatabricksAWSVolumesDownloaderConfig,
)
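The AWS file above, and the Azure, GCP, and native variants that follow, share one pattern: a provider-specific access config nested inside the connection config as a pydantic Secret, plus thin indexer/downloader/uploader dataclasses that only pin the connection type and connector name. As a rough illustration of how these configs compose, here is a minimal sketch for the AWS variant; all values are placeholders, and the host field is an assumption about the base DatabricksVolumesConnectionConfig (defined in volumes.py, which is not part of this excerpt):

from unstructured_ingest.processes.connectors.databricks.volumes_aws import (
    DatabricksAWSVolumesAccessConfig,
    DatabricksAWSVolumesConnectionConfig,
)

# Illustrative values only; "host" is assumed to be declared on the base
# DatabricksVolumesConnectionConfig, which is not shown in this diff.
connection = DatabricksAWSVolumesConnectionConfig(
    host="https://example-workspace.cloud.databricks.com",
    access_config=DatabricksAWSVolumesAccessConfig(
        token="dapi-example-personal-access-token",
    ),
)

# The access config is wrapped in pydantic's Secret, so it stays masked in
# logs and reprs until it is explicitly unwrapped.
token = connection.access_config.get_secret_value().token

The same shape applies to the Azure and GCP modules below; only the credential fields on the access config differ.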
unstructured_ingest/processes/connectors/databricks/volumes_azure.py
@@ -0,0 +1,108 @@
from dataclasses import dataclass, field
from typing import Optional

from pydantic import Field, Secret

from unstructured_ingest.processes.connector_registry import (
    DestinationRegistryEntry,
    SourceRegistryEntry,
)
from unstructured_ingest.processes.connectors.databricks.volumes import (
    DatabricksVolumesAccessConfig,
    DatabricksVolumesConnectionConfig,
    DatabricksVolumesDownloader,
    DatabricksVolumesDownloaderConfig,
    DatabricksVolumesIndexer,
    DatabricksVolumesIndexerConfig,
    DatabricksVolumesUploader,
    DatabricksVolumesUploaderConfig,
)
from unstructured_ingest.processes.utils.blob_storage import (
    BlobStoreUploadStager,
    BlobStoreUploadStagerConfig,
)

CONNECTOR_TYPE = "databricks_volumes_azure"


class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None,
        description="The Databricks account ID for the Databricks accounts endpoint.",
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(
        default=None,
        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
        "which is exchanged for a Databricks host URL.",
    )
    azure_client_secret: Optional[str] = Field(
        default=None, description="The Azure AD service principal’s client secret."
    )
    azure_client_id: Optional[str] = Field(
        default=None, description="The Azure AD service principal’s application ID."
    )
    azure_tenant_id: Optional[str] = Field(
        default=None, description="The Azure AD service principal’s tenant ID."
    )
    azure_environment: Optional[str] = Field(
        default=None,
        description="The Azure environment type for a specific set of API endpoints",
        examples=["Public", "UsGov", "China", "Germany"],
    )


class DatabricksAzureVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    access_config: Secret[DatabricksAzureVolumesAccessConfig]


class DatabricksAzureVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass


@dataclass
class DatabricksAzureVolumesIndexer(DatabricksVolumesIndexer):
    connection_config: DatabricksAzureVolumesConnectionConfig
    index_config: DatabricksAzureVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE


class DatabricksAzureVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass


@dataclass
class DatabricksAzureVolumesDownloader(DatabricksVolumesDownloader):
    connection_config: DatabricksAzureVolumesConnectionConfig
    download_config: DatabricksVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE


class DatabricksAzureVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass


@dataclass
class DatabricksAzureVolumesUploader(DatabricksVolumesUploader):
    connection_config: DatabricksAzureVolumesConnectionConfig
    upload_config: DatabricksAzureVolumesUploaderConfig = field(
        default_factory=DatabricksAzureVolumesUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE


databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
    connection_config=DatabricksAzureVolumesConnectionConfig,
    uploader=DatabricksAzureVolumesUploader,
    uploader_config=DatabricksAzureVolumesUploaderConfig,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)

databricks_azure_volumes_source_entry = SourceRegistryEntry(
    connection_config=DatabricksAzureVolumesConnectionConfig,
    indexer=DatabricksAzureVolumesIndexer,
    indexer_config=DatabricksAzureVolumesIndexerConfig,
    downloader=DatabricksAzureVolumesDownloader,
    downloader_config=DatabricksAzureVolumesDownloaderConfig,
)
unstructured_ingest/processes/connectors/databricks/volumes_gcp.py
@@ -0,0 +1,91 @@
from dataclasses import dataclass, field
from typing import Optional

from pydantic import Field, Secret

from unstructured_ingest.processes.connector_registry import (
    DestinationRegistryEntry,
    SourceRegistryEntry,
)
from unstructured_ingest.processes.connectors.databricks.volumes import (
    DatabricksVolumesAccessConfig,
    DatabricksVolumesConnectionConfig,
    DatabricksVolumesDownloader,
    DatabricksVolumesDownloaderConfig,
    DatabricksVolumesIndexer,
    DatabricksVolumesIndexerConfig,
    DatabricksVolumesUploader,
    DatabricksVolumesUploaderConfig,
)
from unstructured_ingest.processes.utils.blob_storage import (
    BlobStoreUploadStager,
    BlobStoreUploadStagerConfig,
)

CONNECTOR_TYPE = "databricks_volumes_gcp"


class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        default=None,
        description="The Databricks account ID for the Databricks accounts endpoint.",
    )
    profile: Optional[str] = None
    google_credentials: Optional[str] = None
    google_service_account: Optional[str] = None


class DatabricksGoogleVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    access_config: Secret[DatabricksGoogleVolumesAccessConfig]


class DatabricksGoogleVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass


@dataclass
class DatabricksGoogleVolumesIndexer(DatabricksVolumesIndexer):
    connection_config: DatabricksGoogleVolumesConnectionConfig
    index_config: DatabricksGoogleVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE


class DatabricksGoogleVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass


@dataclass
class DatabricksGoogleVolumesDownloader(DatabricksVolumesDownloader):
    connection_config: DatabricksGoogleVolumesConnectionConfig
    download_config: DatabricksVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE


class DatabricksGoogleVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass


@dataclass
class DatabricksGoogleVolumesUploader(DatabricksVolumesUploader):
    connection_config: DatabricksGoogleVolumesConnectionConfig
    upload_config: DatabricksGoogleVolumesUploaderConfig = field(
        default_factory=DatabricksGoogleVolumesUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE


databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
    connection_config=DatabricksGoogleVolumesConnectionConfig,
    uploader=DatabricksGoogleVolumesUploader,
    uploader_config=DatabricksGoogleVolumesUploaderConfig,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)

databricks_gcp_volumes_source_entry = SourceRegistryEntry(
    connection_config=DatabricksGoogleVolumesConnectionConfig,
    indexer=DatabricksGoogleVolumesIndexer,
    indexer_config=DatabricksGoogleVolumesIndexerConfig,
    downloader=DatabricksGoogleVolumesDownloader,
    downloader_config=DatabricksGoogleVolumesDownloaderConfig,
)
unstructured_ingest/processes/connectors/databricks/volumes_native.py
@@ -0,0 +1,92 @@
from dataclasses import dataclass
from typing import Optional

from pydantic import Field, Secret

from unstructured_ingest.processes.connector_registry import (
    DestinationRegistryEntry,
    SourceRegistryEntry,
)
from unstructured_ingest.processes.connectors.databricks.volumes import (
    DatabricksVolumesAccessConfig,
    DatabricksVolumesConnectionConfig,
    DatabricksVolumesDownloader,
    DatabricksVolumesDownloaderConfig,
    DatabricksVolumesIndexer,
    DatabricksVolumesIndexerConfig,
    DatabricksVolumesUploader,
    DatabricksVolumesUploaderConfig,
)
from unstructured_ingest.processes.utils.blob_storage import (
    BlobStoreUploadStager,
    BlobStoreUploadStagerConfig,
)

CONNECTOR_TYPE = "databricks_volumes"


class DatabricksNativeVolumesAccessConfig(DatabricksVolumesAccessConfig):
    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
    client_secret: Optional[str] = Field(
        default=None, description="Client Secret of the OAuth app."
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(
        default=None,
        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
        "which is exchanged for a Databricks host URL.",
    )


class DatabricksNativeVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    access_config: Secret[DatabricksNativeVolumesAccessConfig]


class DatabricksNativeVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass


@dataclass
class DatabricksNativeVolumesIndexer(DatabricksVolumesIndexer):
    connection_config: DatabricksNativeVolumesConnectionConfig
    index_config: DatabricksNativeVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE


class DatabricksNativeVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass


@dataclass
class DatabricksNativeVolumesDownloader(DatabricksVolumesDownloader):
    connection_config: DatabricksNativeVolumesConnectionConfig
    download_config: DatabricksVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE


class DatabricksNativeVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass


@dataclass
class DatabricksNativeVolumesUploader(DatabricksVolumesUploader):
    connection_config: DatabricksNativeVolumesConnectionConfig
    upload_config: DatabricksNativeVolumesUploaderConfig
    connector_type: str = CONNECTOR_TYPE


databricks_native_volumes_destination_entry = DestinationRegistryEntry(
    connection_config=DatabricksNativeVolumesConnectionConfig,
    uploader=DatabricksNativeVolumesUploader,
    uploader_config=DatabricksNativeVolumesUploaderConfig,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)

databricks_native_volumes_source_entry = SourceRegistryEntry(
    connection_config=DatabricksNativeVolumesConnectionConfig,
    indexer=DatabricksNativeVolumesIndexer,
    indexer_config=DatabricksNativeVolumesIndexerConfig,
    downloader=DatabricksNativeVolumesDownloader,
    downloader_config=DatabricksNativeVolumesDownloaderConfig,
)
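The native variant above differs from the cloud-specific ones mainly in how it authenticates: it exposes OAuth client credentials for a Databricks service principal rather than a PAT or cloud-provider credentials, and it registers under the generic "databricks_volumes" connector type. A minimal sketch under the same assumptions as the AWS example earlier (placeholder values, host assumed on the base connection config):

from unstructured_ingest.processes.connectors.databricks.volumes_native import (
    DatabricksNativeVolumesAccessConfig,
    DatabricksNativeVolumesConnectionConfig,
)

# Hypothetical service-principal credentials, for illustration only.
connection = DatabricksNativeVolumesConnectionConfig(
    host="https://example-workspace.cloud.databricks.com",
    access_config=DatabricksNativeVolumesAccessConfig(
        client_id="example-oauth-client-id",
        client_secret="example-oauth-client-secret",
    ),
)

Note also that DatabricksNativeVolumesUploader declares upload_config without a default_factory, so callers must always supply it explicitly, unlike the AWS, Azure, and GCP uploaders above.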
unstructured_ingest/processes/connectors/databricks/volumes_table.py
@@ -0,0 +1,187 @@
import json
import os
from contextlib import contextmanager
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generator, Optional

from pydantic import Field

from unstructured_ingest.data_types.file_data import FileData
from unstructured_ingest.error import ValueError
from unstructured_ingest.interfaces import (
    Uploader,
    UploaderConfig,
    UploadStager,
    UploadStagerConfig,
)
from unstructured_ingest.logger import logger
from unstructured_ingest.processes.connector_registry import (
    DestinationRegistryEntry,
)
from unstructured_ingest.processes.connectors.databricks.volumes import DatabricksPathMixin
from unstructured_ingest.processes.connectors.sql.databricks_delta_tables import (
    DatabricksDeltaTablesConnectionConfig,
    DatabricksDeltaTablesUploadStagerConfig,
)
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data

CONNECTOR_TYPE = "databricks_volume_delta_tables"

if TYPE_CHECKING:
    pass


class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
    database: str = Field(description="Database name", default="default")
    table_name: Optional[str] = Field(description="Table name", default=None)


class DatabricksVolumeDeltaTableStagerConfig(UploadStagerConfig):
    pass


@dataclass
class DatabricksVolumeDeltaTableStager(UploadStager):
    upload_stager_config: DatabricksVolumeDeltaTableStagerConfig = field(
        default_factory=DatabricksVolumeDeltaTableStagerConfig
    )

    def run(
        self,
        elements_filepath: Path,
        output_dir: Path,
        output_filename: str,
        file_data: FileData,
        **kwargs: Any,
    ) -> Path:
        # To avoid new line issues when migrating from volumes into delta tables, omit indenting
        # and always write it as a json file
        output_dir.mkdir(exist_ok=True, parents=True)
        output_path = output_dir / output_filename
        final_output_path = output_path.with_suffix(".json")
        data = get_json_data(path=elements_filepath)
        for element in data:
            element["id"] = get_enhanced_element_id(element_dict=element, file_data=file_data)
            element[RECORD_ID_LABEL] = file_data.identifier
            element["metadata"] = json.dumps(element.get("metadata", {}))
        write_data(path=final_output_path, data=data, indent=None)
        return final_output_path


@dataclass
class DatabricksVolumeDeltaTableUploader(Uploader):
    connection_config: DatabricksDeltaTablesConnectionConfig
    upload_config: DatabricksVolumeDeltaTableUploaderConfig
    connector_type: str = CONNECTOR_TYPE
    _columns: Optional[dict[str, str]] = None

    def init(self, **kwargs: Any) -> None:
        self.create_destination(**kwargs)

    def create_destination(
        self, destination_name: str = "unstructuredautocreated", **kwargs: Any
    ) -> bool:
        table_name = self.upload_config.table_name or destination_name
        self.upload_config.table_name = table_name
        connectors_dir = Path(__file__).parents[1]
        collection_config_file = connectors_dir / "assets" / "databricks_delta_table_schema.sql"
        with self.get_cursor() as cursor:
            cursor.execute("SHOW TABLES")
            table_names = [r[1] for r in cursor.fetchall()]
            if table_name in table_names:
                return False
            with collection_config_file.open() as schema_file:
                data_lines = schema_file.readlines()
            data_lines[0] = data_lines[0].replace("elements", table_name)
            destination_schema = "".join([line.strip() for line in data_lines])
            logger.info(f"creating table {table_name} for user")
            cursor.execute(destination_schema)
        return True

    def precheck(self) -> None:
        with self.connection_config.get_cursor() as cursor:
            cursor.execute("SHOW CATALOGS")
            catalogs = [r[0] for r in cursor.fetchall()]
            if self.upload_config.catalog not in catalogs:
                raise ValueError(
                    "Catalog {} not found in {}".format(
                        self.upload_config.catalog, ", ".join(catalogs)
                    )
                )
            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
            cursor.execute("SHOW DATABASES")
            databases = [r[0] for r in cursor.fetchall()]
            if self.upload_config.database not in databases:
                raise ValueError(
                    "Database {} not found in {}".format(
                        self.upload_config.database, ", ".join(databases)
                    )
                )

    def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
        filename = Path(file_data.source_identifiers.filename)
        adjusted_filename = filename if filename.suffix == suffix else f"{filename}{suffix}"
        return os.path.join(self.upload_config.path, f"{adjusted_filename}")

    @contextmanager
    def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
        with self.connection_config.get_cursor(**connect_kwargs) as cursor:
            logger.debug(f"executing: USE CATALOG: '{self.upload_config.catalog}'")
            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
            logger.debug(f"executing: USE DATABASE: {self.upload_config.database}")
            cursor.execute(f"USE DATABASE {self.upload_config.database}")
            yield cursor

    def get_table_columns(self) -> dict[str, str]:
        if self._columns is None:
            with self.get_cursor() as cursor:
                cursor.execute(f"SELECT * from `{self.upload_config.table_name}` LIMIT 1")
                self._columns = {desc[0]: desc[1] for desc in cursor.description}
        return self._columns

    def can_delete(self) -> bool:
        existing_columns = self.get_table_columns()
        return RECORD_ID_LABEL in existing_columns

    def delete_previous_content(self, file_data: FileData) -> None:
        logger.debug(
            f"deleting any content with metadata "
            f"{RECORD_ID_LABEL}={file_data.identifier} "
            f"from delta table: {self.upload_config.table_name}"
        )
        with self.get_cursor() as cursor:
            cursor.execute(
                f"DELETE FROM `{self.upload_config.table_name}` WHERE {RECORD_ID_LABEL} = '{file_data.identifier}'"  # noqa: E501
            )
            results = cursor.fetchall()
            deleted_rows = results[0][0]
            logger.debug(f"deleted {deleted_rows} rows from table {self.upload_config.table_name}")

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        if self.can_delete():
            self.delete_previous_content(file_data=file_data)
        with self.get_cursor(staging_allowed_local_path=path.parent.as_posix()) as cursor:
            catalog_path = self.get_output_path(file_data=file_data)
            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
            logger.debug(
                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
            )
            data = get_json_data(path=path)
            columns = data[0].keys()
            select_columns = ["PARSE_JSON(metadata)" if c == "metadata" else c for c in columns]
            column_str = ", ".join(columns)
            select_column_str = ", ".join(select_columns)
            sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {select_column_str} FROM json.`{catalog_path}`"  # noqa: E501
            cursor.execute(sql_statment)


databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
    connection_config=DatabricksDeltaTablesConnectionConfig,
    uploader=DatabricksVolumeDeltaTableUploader,
    uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
    upload_stager=DatabricksVolumeDeltaTableStager,
    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
)