unstructured-ingest 0.5.23__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/databricks/test_volumes_native.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +1 -1
- test/integration/connectors/duckdb/test_motherduck.py +1 -1
- test/integration/connectors/elasticsearch/test_elasticsearch.py +1 -1
- test/integration/connectors/elasticsearch/test_opensearch.py +1 -1
- test/integration/connectors/sql/test_databricks_delta_tables.py +1 -1
- test/integration/connectors/sql/test_postgres.py +1 -1
- test/integration/connectors/sql/test_singlestore.py +1 -1
- test/integration/connectors/sql/test_snowflake.py +1 -1
- test/integration/connectors/sql/test_sqlite.py +1 -1
- test/integration/connectors/test_astradb.py +1 -1
- test/integration/connectors/test_azure_ai_search.py +1 -1
- test/integration/connectors/test_chroma.py +1 -1
- test/integration/connectors/test_delta_table.py +1 -1
- test/integration/connectors/test_lancedb.py +1 -1
- test/integration/connectors/test_milvus.py +1 -1
- test/integration/connectors/test_mongodb.py +1 -1
- test/integration/connectors/test_neo4j.py +5 -5
- test/integration/connectors/test_onedrive.py +1 -1
- test/integration/connectors/test_pinecone.py +1 -1
- test/integration/connectors/test_qdrant.py +1 -1
- test/integration/connectors/test_redis.py +1 -1
- test/integration/connectors/test_s3.py +1 -1
- test/integration/connectors/test_vectara.py +68 -56
- test/integration/connectors/utils/validation/destination.py +2 -1
- test/integration/connectors/utils/validation/source.py +2 -1
- test/integration/connectors/weaviate/test_local.py +1 -1
- test/unit/test_html.py +1 -1
- test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +1 -1
- test/unit/v2/connectors/motherduck/test_base.py +1 -2
- test/unit/v2/connectors/sql/test_sql.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/html.py +2 -1
- unstructured_ingest/v2/interfaces/__init__.py +0 -13
- unstructured_ingest/v2/interfaces/downloader.py +1 -1
- unstructured_ingest/v2/interfaces/indexer.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +2 -2
- unstructured_ingest/v2/interfaces/uploader.py +2 -3
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -2
- unstructured_ingest/v2/pipeline/steps/download.py +2 -3
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -2
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -2
- unstructured_ingest/v2/processes/connectors/airtable.py +1 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +7 -5
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/v2/processes/connectors/chroma.py +1 -1
- unstructured_ingest/v2/processes/connectors/confluence.py +5 -3
- unstructured_ingest/v2/processes/connectors/couchbase.py +7 -5
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +64 -19
- unstructured_ingest/v2/processes/connectors/delta_table.py +1 -1
- unstructured_ingest/v2/processes/connectors/discord.py +5 -3
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -1
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +1 -1
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +1 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -5
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +5 -3
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -4
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +1 -1
- unstructured_ingest/v2/processes/connectors/gitlab.py +5 -3
- unstructured_ingest/v2/processes/connectors/google_drive.py +5 -3
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +1 -1
- unstructured_ingest/v2/processes/connectors/jira.py +5 -3
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +5 -3
- unstructured_ingest/v2/processes/connectors/kdbai.py +1 -1
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -4
- unstructured_ingest/v2/processes/connectors/local.py +5 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/mongodb.py +7 -5
- unstructured_ingest/v2/processes/connectors/neo4j.py +1 -1
- unstructured_ingest/v2/processes/connectors/notion/connector.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +5 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +5 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +1 -1
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +1 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +1 -1
- unstructured_ingest/v2/processes/connectors/salesforce.py +5 -3
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -3
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +13 -8
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +3 -3
- unstructured_ingest/v2/processes/connectors/vectara.py +1 -1
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +1 -1
- unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +5 -3
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/uncompress.py +1 -1
- unstructured_ingest/v2/processes/utils/blob_storage.py +2 -1
- unstructured_ingest/v2/utils.py +1 -1
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/METADATA +101 -101
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/RECORD +104 -105
- unstructured_ingest/v2/interfaces/file_data.py +0 -13
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -3,10 +3,9 @@ from pathlib import Path
|
|
|
3
3
|
import pytest
|
|
4
4
|
from pytest_mock import MockerFixture
|
|
5
5
|
|
|
6
|
-
from unstructured_ingest.v2.interfaces import FileData
|
|
7
|
-
from unstructured_ingest.v2.interfaces.file_data import SourceIdentifiers
|
|
8
6
|
from unstructured_ingest.v2.interfaces.upload_stager import UploadStagerConfig
|
|
9
7
|
from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
|
|
8
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
10
9
|
|
|
11
10
|
|
|
12
11
|
@pytest.fixture
|
|
@@ -4,13 +4,13 @@ import pandas as pd
|
|
|
4
4
|
import pytest
|
|
5
5
|
from pytest_mock import MockerFixture
|
|
6
6
|
|
|
7
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
8
7
|
from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
9
8
|
SQLConnectionConfig,
|
|
10
9
|
SQLUploader,
|
|
11
10
|
SQLUploaderConfig,
|
|
12
11
|
SQLUploadStager,
|
|
13
12
|
)
|
|
13
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
@pytest.fixture
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.
|
|
1
|
+
__version__ = "0.6.0" # pragma: no cover
|
|
@@ -7,8 +7,9 @@ from uuid import NAMESPACE_DNS, uuid5
|
|
|
7
7
|
from pydantic import BaseModel, Field
|
|
8
8
|
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
|
-
from unstructured_ingest.v2.interfaces import DownloadResponse
|
|
10
|
+
from unstructured_ingest.v2.interfaces import DownloadResponse
|
|
11
11
|
from unstructured_ingest.v2.logger import logger
|
|
12
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
12
13
|
|
|
13
14
|
if TYPE_CHECKING:
|
|
14
15
|
from bs4.element import Tag
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
from unstructured_ingest.v2.types.file_data import (
|
|
2
|
-
BatchFileData,
|
|
3
|
-
BatchItem,
|
|
4
|
-
FileData,
|
|
5
|
-
FileDataSourceMetadata,
|
|
6
|
-
SourceIdentifiers,
|
|
7
|
-
)
|
|
8
|
-
|
|
9
1
|
from .connector import AccessConfig, BaseConnector, ConnectionConfig
|
|
10
2
|
from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
|
|
11
3
|
from .indexer import Indexer, IndexerConfig
|
|
@@ -19,7 +11,6 @@ __all__ = [
|
|
|
19
11
|
"download_responses",
|
|
20
12
|
"Downloader",
|
|
21
13
|
"DownloaderConfig",
|
|
22
|
-
"FileData",
|
|
23
14
|
"Indexer",
|
|
24
15
|
"IndexerConfig",
|
|
25
16
|
"BaseProcess",
|
|
@@ -28,13 +19,9 @@ __all__ = [
|
|
|
28
19
|
"UploadStagerConfig",
|
|
29
20
|
"Uploader",
|
|
30
21
|
"UploaderConfig",
|
|
31
|
-
"SourceIdentifiers",
|
|
32
22
|
"UploadContent",
|
|
33
23
|
"AccessConfig",
|
|
34
24
|
"ConnectionConfig",
|
|
35
25
|
"BaseConnector",
|
|
36
|
-
"FileDataSourceMetadata",
|
|
37
|
-
"BatchFileData",
|
|
38
|
-
"BatchItem",
|
|
39
26
|
"VectorDBUploader",
|
|
40
27
|
]
|
|
@@ -6,8 +6,8 @@ from typing import Any, Optional, TypedDict, TypeVar, Union
|
|
|
6
6
|
from pydantic import BaseModel, Field
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
9
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
10
9
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
10
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class DownloaderConfig(BaseModel):
|
|
@@ -4,8 +4,8 @@ from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
|
|
|
4
4
|
from pydantic import BaseModel
|
|
5
5
|
|
|
6
6
|
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
7
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
8
7
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
8
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class IndexerConfig(BaseModel):
|
|
@@ -7,8 +7,8 @@ from pydantic import BaseModel
|
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.utils import ndjson
|
|
9
9
|
from unstructured_ingest.utils.data_prep import get_data, write_data
|
|
10
|
-
from unstructured_ingest.v2.interfaces
|
|
11
|
-
from unstructured_ingest.v2.
|
|
10
|
+
from unstructured_ingest.v2.interfaces import BaseProcess
|
|
11
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class UploadStagerConfig(BaseModel):
|
|
@@ -6,9 +6,8 @@ from typing import Any, TypeVar
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.utils.data_prep import get_data
|
|
9
|
-
from unstructured_ingest.v2.interfaces
|
|
10
|
-
from unstructured_ingest.v2.
|
|
11
|
-
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
9
|
+
from unstructured_ingest.v2.interfaces import BaseConnector, BaseProcess
|
|
10
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
12
11
|
|
|
13
12
|
|
|
14
13
|
class UploaderConfig(BaseModel):
|
|
@@ -5,11 +5,10 @@ from pathlib import Path
|
|
|
5
5
|
from typing import Callable, Optional, TypedDict
|
|
6
6
|
|
|
7
7
|
from unstructured_ingest.utils.data_prep import write_data
|
|
8
|
-
from unstructured_ingest.v2.interfaces import FileData
|
|
9
|
-
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
|
|
10
8
|
from unstructured_ingest.v2.logger import logger
|
|
11
9
|
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
12
10
|
from unstructured_ingest.v2.processes.chunker import Chunker
|
|
11
|
+
from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
|
|
13
12
|
from unstructured_ingest.v2.utils import serialize_base_model_json
|
|
14
13
|
|
|
15
14
|
STEP_ID = "chunk"
|
|
@@ -6,11 +6,10 @@ from dataclasses import dataclass
|
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import Callable, Optional, TypedDict, TypeVar
|
|
8
8
|
|
|
9
|
-
from unstructured_ingest.v2.interfaces import
|
|
10
|
-
from unstructured_ingest.v2.interfaces.downloader import Downloader
|
|
11
|
-
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
|
|
9
|
+
from unstructured_ingest.v2.interfaces import Downloader, download_responses
|
|
12
10
|
from unstructured_ingest.v2.logger import logger
|
|
13
11
|
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
12
|
+
from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
|
|
14
13
|
from unstructured_ingest.v2.utils import serialize_base_model_json
|
|
15
14
|
|
|
16
15
|
DownloaderT = TypeVar("DownloaderT", bound=Downloader)
|
|
@@ -5,11 +5,10 @@ from pathlib import Path
|
|
|
5
5
|
from typing import Callable, Optional, TypedDict
|
|
6
6
|
|
|
7
7
|
from unstructured_ingest.utils.data_prep import write_data
|
|
8
|
-
from unstructured_ingest.v2.interfaces import FileData
|
|
9
|
-
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
|
|
10
8
|
from unstructured_ingest.v2.logger import logger
|
|
11
9
|
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
12
10
|
from unstructured_ingest.v2.processes.embedder import Embedder
|
|
11
|
+
from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
|
|
13
12
|
from unstructured_ingest.v2.utils import serialize_base_model_json
|
|
14
13
|
|
|
15
14
|
STEP_ID = "embed"
|
|
@@ -2,10 +2,10 @@ import asyncio
|
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from typing import Callable, Optional
|
|
4
4
|
|
|
5
|
-
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
|
|
6
5
|
from unstructured_ingest.v2.logger import logger
|
|
7
6
|
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
8
7
|
from unstructured_ingest.v2.processes.filter import Filterer
|
|
8
|
+
from unstructured_ingest.v2.types.file_data import file_data_from_file
|
|
9
9
|
|
|
10
10
|
STEP_ID = "filter"
|
|
11
11
|
|
|
@@ -5,11 +5,10 @@ from pathlib import Path
|
|
|
5
5
|
from typing import Callable, Optional, TypedDict
|
|
6
6
|
|
|
7
7
|
from unstructured_ingest.utils.data_prep import write_data
|
|
8
|
-
from unstructured_ingest.v2.interfaces import FileData
|
|
9
|
-
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
|
|
10
8
|
from unstructured_ingest.v2.logger import logger
|
|
11
9
|
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
12
10
|
from unstructured_ingest.v2.processes.partitioner import Partitioner
|
|
11
|
+
from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
|
|
13
12
|
from unstructured_ingest.v2.utils import serialize_base_model_json
|
|
14
13
|
|
|
15
14
|
STEP_ID = "partition"
|
|
@@ -4,10 +4,10 @@ from dataclasses import dataclass
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Callable, Optional, TypedDict
|
|
6
6
|
|
|
7
|
-
from unstructured_ingest.v2.interfaces
|
|
8
|
-
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
|
|
7
|
+
from unstructured_ingest.v2.interfaces import UploadStager
|
|
9
8
|
from unstructured_ingest.v2.logger import logger
|
|
10
9
|
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
10
|
+
from unstructured_ingest.v2.types.file_data import file_data_from_file
|
|
11
11
|
from unstructured_ingest.v2.utils import serialize_base_model_json
|
|
12
12
|
|
|
13
13
|
STEP_ID = "upload_stage"
|
|
@@ -3,10 +3,10 @@ from dataclasses import dataclass
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Callable, TypedDict
|
|
5
5
|
|
|
6
|
-
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
|
|
7
6
|
from unstructured_ingest.v2.logger import logger
|
|
8
7
|
from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
|
|
9
8
|
from unstructured_ingest.v2.processes.uncompress import Uncompressor
|
|
9
|
+
from unstructured_ingest.v2.types.file_data import file_data_from_file
|
|
10
10
|
|
|
11
11
|
STEP_ID = "uncompress"
|
|
12
12
|
|
|
@@ -3,11 +3,11 @@ from dataclasses import dataclass
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Callable, Optional, TypedDict
|
|
5
5
|
|
|
6
|
-
from unstructured_ingest.v2.interfaces
|
|
7
|
-
from unstructured_ingest.v2.interfaces.uploader import UploadContent
|
|
6
|
+
from unstructured_ingest.v2.interfaces import UploadContent
|
|
8
7
|
from unstructured_ingest.v2.logger import logger
|
|
9
8
|
from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
|
|
10
9
|
from unstructured_ingest.v2.pipeline.otel import instrument
|
|
10
|
+
from unstructured_ingest.v2.types.file_data import file_data_from_file
|
|
11
11
|
|
|
12
12
|
STEP_ID = "upload"
|
|
13
13
|
|
|
@@ -13,14 +13,13 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
13
13
|
Downloader,
|
|
14
14
|
DownloaderConfig,
|
|
15
15
|
DownloadResponse,
|
|
16
|
-
FileData,
|
|
17
16
|
Indexer,
|
|
18
17
|
IndexerConfig,
|
|
19
|
-
SourceIdentifiers,
|
|
20
18
|
)
|
|
21
19
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
22
20
|
SourceRegistryEntry,
|
|
23
21
|
)
|
|
22
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
24
23
|
|
|
25
24
|
if TYPE_CHECKING:
|
|
26
25
|
from pyairtable import Api
|
|
@@ -21,17 +21,12 @@ from unstructured_ingest.utils.string_and_date_utils import truncate_string_byte
|
|
|
21
21
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
22
22
|
from unstructured_ingest.v2.interfaces import (
|
|
23
23
|
AccessConfig,
|
|
24
|
-
BatchFileData,
|
|
25
|
-
BatchItem,
|
|
26
24
|
ConnectionConfig,
|
|
27
25
|
Downloader,
|
|
28
26
|
DownloaderConfig,
|
|
29
27
|
DownloadResponse,
|
|
30
|
-
FileData,
|
|
31
|
-
FileDataSourceMetadata,
|
|
32
28
|
Indexer,
|
|
33
29
|
IndexerConfig,
|
|
34
|
-
SourceIdentifiers,
|
|
35
30
|
Uploader,
|
|
36
31
|
UploaderConfig,
|
|
37
32
|
UploadStager,
|
|
@@ -44,6 +39,13 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
44
39
|
SourceRegistryEntry,
|
|
45
40
|
)
|
|
46
41
|
from unstructured_ingest.v2.processes.connectors.utils import format_and_truncate_orig_elements
|
|
42
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
43
|
+
BatchFileData,
|
|
44
|
+
BatchItem,
|
|
45
|
+
FileData,
|
|
46
|
+
FileDataSourceMetadata,
|
|
47
|
+
SourceIdentifiers,
|
|
48
|
+
)
|
|
47
49
|
|
|
48
50
|
if TYPE_CHECKING:
|
|
49
51
|
from astrapy import AsyncCollection as AstraDBAsyncCollection
|
|
@@ -12,7 +12,6 @@ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
|
12
12
|
from unstructured_ingest.v2.interfaces import (
|
|
13
13
|
AccessConfig,
|
|
14
14
|
ConnectionConfig,
|
|
15
|
-
FileData,
|
|
16
15
|
Uploader,
|
|
17
16
|
UploaderConfig,
|
|
18
17
|
UploadStager,
|
|
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
23
22
|
DestinationRegistryEntry,
|
|
24
23
|
)
|
|
25
24
|
from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
|
|
25
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
26
26
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
27
27
|
|
|
28
28
|
if TYPE_CHECKING:
|
|
@@ -12,7 +12,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
12
12
|
from unstructured_ingest.v2.interfaces import (
|
|
13
13
|
AccessConfig,
|
|
14
14
|
ConnectionConfig,
|
|
15
|
-
FileData,
|
|
16
15
|
Uploader,
|
|
17
16
|
UploaderConfig,
|
|
18
17
|
UploadStager,
|
|
@@ -20,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
20
19
|
)
|
|
21
20
|
from unstructured_ingest.v2.logger import logger
|
|
22
21
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
22
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
23
23
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
24
24
|
|
|
25
25
|
from .utils import conform_string_to_dict
|
|
@@ -15,17 +15,19 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
15
15
|
Downloader,
|
|
16
16
|
DownloaderConfig,
|
|
17
17
|
DownloadResponse,
|
|
18
|
-
FileData,
|
|
19
|
-
FileDataSourceMetadata,
|
|
20
18
|
Indexer,
|
|
21
19
|
IndexerConfig,
|
|
22
|
-
SourceIdentifiers,
|
|
23
20
|
download_responses,
|
|
24
21
|
)
|
|
25
22
|
from unstructured_ingest.v2.logger import logger
|
|
26
23
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
27
24
|
SourceRegistryEntry,
|
|
28
25
|
)
|
|
26
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
27
|
+
FileData,
|
|
28
|
+
FileDataSourceMetadata,
|
|
29
|
+
SourceIdentifiers,
|
|
30
|
+
)
|
|
29
31
|
|
|
30
32
|
if TYPE_CHECKING:
|
|
31
33
|
from atlassian import Confluence
|
|
@@ -17,17 +17,12 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
|
|
|
17
17
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
18
18
|
from unstructured_ingest.v2.interfaces import (
|
|
19
19
|
AccessConfig,
|
|
20
|
-
BatchFileData,
|
|
21
|
-
BatchItem,
|
|
22
20
|
ConnectionConfig,
|
|
23
21
|
Downloader,
|
|
24
22
|
DownloaderConfig,
|
|
25
23
|
DownloadResponse,
|
|
26
|
-
FileData,
|
|
27
|
-
FileDataSourceMetadata,
|
|
28
24
|
Indexer,
|
|
29
25
|
IndexerConfig,
|
|
30
|
-
SourceIdentifiers,
|
|
31
26
|
Uploader,
|
|
32
27
|
UploaderConfig,
|
|
33
28
|
UploadStager,
|
|
@@ -39,6 +34,13 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
39
34
|
DestinationRegistryEntry,
|
|
40
35
|
SourceRegistryEntry,
|
|
41
36
|
)
|
|
37
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
38
|
+
BatchFileData,
|
|
39
|
+
BatchItem,
|
|
40
|
+
FileData,
|
|
41
|
+
FileDataSourceMetadata,
|
|
42
|
+
SourceIdentifiers,
|
|
43
|
+
)
|
|
42
44
|
|
|
43
45
|
if TYPE_CHECKING:
|
|
44
46
|
from couchbase.cluster import Cluster
|
|
@@ -20,15 +20,17 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
20
20
|
Downloader,
|
|
21
21
|
DownloaderConfig,
|
|
22
22
|
DownloadResponse,
|
|
23
|
-
FileData,
|
|
24
|
-
FileDataSourceMetadata,
|
|
25
23
|
Indexer,
|
|
26
24
|
IndexerConfig,
|
|
27
|
-
SourceIdentifiers,
|
|
28
25
|
Uploader,
|
|
29
26
|
UploaderConfig,
|
|
30
27
|
)
|
|
31
28
|
from unstructured_ingest.v2.logger import logger
|
|
29
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
30
|
+
FileData,
|
|
31
|
+
FileDataSourceMetadata,
|
|
32
|
+
SourceIdentifiers,
|
|
33
|
+
)
|
|
32
34
|
|
|
33
35
|
if TYPE_CHECKING:
|
|
34
36
|
from databricks.sdk import WorkspaceClient
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import os
|
|
2
|
+
import tempfile
|
|
3
3
|
from contextlib import contextmanager
|
|
4
4
|
from dataclasses import dataclass
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Generator
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
7
|
|
|
8
8
|
from pydantic import Field
|
|
9
9
|
|
|
10
|
-
from unstructured_ingest.utils.data_prep import write_data
|
|
11
|
-
from unstructured_ingest.v2.interfaces import
|
|
10
|
+
from unstructured_ingest.utils.data_prep import get_data_df, write_data
|
|
11
|
+
from unstructured_ingest.v2.interfaces import Uploader, UploaderConfig
|
|
12
12
|
from unstructured_ingest.v2.logger import logger
|
|
13
13
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
14
14
|
DestinationRegistryEntry,
|
|
@@ -19,9 +19,13 @@ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables imp
|
|
|
19
19
|
DatabricksDeltaTablesUploadStager,
|
|
20
20
|
DatabricksDeltaTablesUploadStagerConfig,
|
|
21
21
|
)
|
|
22
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
22
23
|
|
|
23
24
|
CONNECTOR_TYPE = "databricks_volume_delta_tables"
|
|
24
25
|
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from pandas import DataFrame
|
|
28
|
+
|
|
25
29
|
|
|
26
30
|
class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
|
|
27
31
|
database: str = Field(description="Database name", default="default")
|
|
@@ -30,10 +34,12 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMix
|
|
|
30
34
|
|
|
31
35
|
@dataclass
|
|
32
36
|
class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
|
|
33
|
-
def write_output(self, output_path: Path, data: list[dict]) ->
|
|
37
|
+
def write_output(self, output_path: Path, data: list[dict]) -> Path:
|
|
34
38
|
# To avoid new line issues when migrating from volumes into delta tables, omit indenting
|
|
35
39
|
# and always write it as a json file
|
|
36
|
-
|
|
40
|
+
final_output_path = output_path.with_suffix(".json")
|
|
41
|
+
write_data(path=final_output_path, data=data, indent=None)
|
|
42
|
+
return final_output_path
|
|
37
43
|
|
|
38
44
|
|
|
39
45
|
@dataclass
|
|
@@ -41,6 +47,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
|
|
|
41
47
|
connection_config: DatabricksDeltaTablesConnectionConfig
|
|
42
48
|
upload_config: DatabricksVolumeDeltaTableUploaderConfig
|
|
43
49
|
connector_type: str = CONNECTOR_TYPE
|
|
50
|
+
_columns: Optional[dict[str, str]] = None
|
|
44
51
|
|
|
45
52
|
def precheck(self) -> None:
|
|
46
53
|
with self.connection_config.get_cursor() as cursor:
|
|
@@ -84,20 +91,58 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
|
|
|
84
91
|
cursor.execute(f"USE DATABASE {self.upload_config.database}")
|
|
85
92
|
yield cursor
|
|
86
93
|
|
|
87
|
-
def
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
+
def get_table_columns(self) -> dict[str, str]:
|
|
95
|
+
if self._columns is None:
|
|
96
|
+
with self.get_cursor() as cursor:
|
|
97
|
+
cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
|
|
98
|
+
self._columns = {desc[0]: desc[1] for desc in cursor.description}
|
|
99
|
+
return self._columns
|
|
100
|
+
|
|
101
|
+
def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
|
|
102
|
+
import pandas as pd
|
|
103
|
+
|
|
104
|
+
table_columns = self.get_table_columns()
|
|
105
|
+
columns = set(df.columns)
|
|
106
|
+
schema_fields = set(table_columns.keys())
|
|
107
|
+
columns_to_drop = columns - schema_fields
|
|
108
|
+
missing_columns = schema_fields - columns
|
|
109
|
+
|
|
110
|
+
if columns_to_drop:
|
|
111
|
+
logger.info(
|
|
112
|
+
"Following columns will be dropped to match the table's schema: "
|
|
113
|
+
f"{', '.join(columns_to_drop)}"
|
|
114
|
+
)
|
|
115
|
+
if missing_columns and add_missing_columns:
|
|
116
|
+
logger.info(
|
|
117
|
+
"Following null filled columns will be added to match the table's schema:"
|
|
118
|
+
f" {', '.join(missing_columns)} "
|
|
94
119
|
)
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
120
|
+
|
|
121
|
+
df = df.drop(columns=columns_to_drop)
|
|
122
|
+
|
|
123
|
+
if add_missing_columns:
|
|
124
|
+
for column in missing_columns:
|
|
125
|
+
df[column] = pd.Series()
|
|
126
|
+
return df
|
|
127
|
+
|
|
128
|
+
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
129
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
130
|
+
df = get_data_df()
|
|
131
|
+
df = self._fit_to_schema(df=df)
|
|
132
|
+
temp_path = Path(temp_dir) / path.name
|
|
133
|
+
df.to_json(temp_path, orient="records", lines=False)
|
|
134
|
+
with self.get_cursor(staging_allowed_local_path=temp_dir) as cursor:
|
|
135
|
+
catalog_path = self.get_output_path(file_data=file_data)
|
|
136
|
+
logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
|
|
137
|
+
cursor.execute(f"PUT '{temp_path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
|
|
138
|
+
logger.debug(
|
|
139
|
+
f"migrating content from {catalog_path} to "
|
|
140
|
+
f"table {self.upload_config.table_name}"
|
|
141
|
+
)
|
|
142
|
+
columns = list(df.columns)
|
|
143
|
+
column_str = ", ".join(columns)
|
|
144
|
+
sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`" # noqa: E501
|
|
145
|
+
cursor.execute(sql_statment)
|
|
101
146
|
|
|
102
147
|
|
|
103
148
|
databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
|
|
@@ -15,7 +15,6 @@ from unstructured_ingest.utils.table import convert_to_pandas_dataframe
|
|
|
15
15
|
from unstructured_ingest.v2.interfaces import (
|
|
16
16
|
AccessConfig,
|
|
17
17
|
ConnectionConfig,
|
|
18
|
-
FileData,
|
|
19
18
|
Uploader,
|
|
20
19
|
UploaderConfig,
|
|
21
20
|
UploadStager,
|
|
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
23
22
|
)
|
|
24
23
|
from unstructured_ingest.v2.logger import logger
|
|
25
24
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
25
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
26
26
|
|
|
27
27
|
CONNECTOR_TYPE = "delta_table"
|
|
28
28
|
|
|
@@ -12,14 +12,16 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
12
12
|
Downloader,
|
|
13
13
|
DownloaderConfig,
|
|
14
14
|
DownloadResponse,
|
|
15
|
-
FileData,
|
|
16
|
-
FileDataSourceMetadata,
|
|
17
15
|
Indexer,
|
|
18
16
|
IndexerConfig,
|
|
19
|
-
SourceIdentifiers,
|
|
20
17
|
)
|
|
21
18
|
from unstructured_ingest.v2.logger import logger
|
|
22
19
|
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
20
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
21
|
+
FileData,
|
|
22
|
+
FileDataSourceMetadata,
|
|
23
|
+
SourceIdentifiers,
|
|
24
|
+
)
|
|
23
25
|
|
|
24
26
|
if TYPE_CHECKING:
|
|
25
27
|
from discord import Client as DiscordClient
|
|
@@ -4,7 +4,8 @@ from typing import Any
|
|
|
4
4
|
|
|
5
5
|
from unstructured_ingest.utils.data_prep import get_data, write_data
|
|
6
6
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
7
|
-
from unstructured_ingest.v2.interfaces import
|
|
7
|
+
from unstructured_ingest.v2.interfaces import UploadStager
|
|
8
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
8
9
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
9
10
|
|
|
10
11
|
_COLUMNS = (
|
|
@@ -11,7 +11,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
11
11
|
from unstructured_ingest.v2.interfaces import (
|
|
12
12
|
AccessConfig,
|
|
13
13
|
ConnectionConfig,
|
|
14
|
-
FileData,
|
|
15
14
|
Uploader,
|
|
16
15
|
UploaderConfig,
|
|
17
16
|
UploadStagerConfig,
|
|
@@ -19,6 +18,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
19
18
|
from unstructured_ingest.v2.logger import logger
|
|
20
19
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
21
20
|
from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
|
|
21
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
22
22
|
|
|
23
23
|
if TYPE_CHECKING:
|
|
24
24
|
from duckdb import DuckDBPyConnection as DuckDBConnection
|
|
@@ -12,7 +12,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
12
12
|
from unstructured_ingest.v2.interfaces import (
|
|
13
13
|
AccessConfig,
|
|
14
14
|
ConnectionConfig,
|
|
15
|
-
FileData,
|
|
16
15
|
Uploader,
|
|
17
16
|
UploaderConfig,
|
|
18
17
|
UploadStagerConfig,
|
|
@@ -20,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
20
19
|
from unstructured_ingest.v2.logger import logger
|
|
21
20
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
22
21
|
from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
|
|
22
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
23
23
|
|
|
24
24
|
if TYPE_CHECKING:
|
|
25
25
|
from duckdb import DuckDBPyConnection as MotherDuckConnection
|
|
@@ -23,17 +23,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
23
23
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
24
24
|
from unstructured_ingest.v2.interfaces import (
|
|
25
25
|
AccessConfig,
|
|
26
|
-
BatchFileData,
|
|
27
|
-
BatchItem,
|
|
28
26
|
ConnectionConfig,
|
|
29
27
|
Downloader,
|
|
30
28
|
DownloaderConfig,
|
|
31
29
|
DownloadResponse,
|
|
32
|
-
FileData,
|
|
33
|
-
FileDataSourceMetadata,
|
|
34
30
|
Indexer,
|
|
35
31
|
IndexerConfig,
|
|
36
|
-
SourceIdentifiers,
|
|
37
32
|
Uploader,
|
|
38
33
|
UploaderConfig,
|
|
39
34
|
UploadStager,
|
|
@@ -45,6 +40,13 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
45
40
|
DestinationRegistryEntry,
|
|
46
41
|
SourceRegistryEntry,
|
|
47
42
|
)
|
|
43
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
44
|
+
BatchFileData,
|
|
45
|
+
BatchItem,
|
|
46
|
+
FileData,
|
|
47
|
+
FileDataSourceMetadata,
|
|
48
|
+
SourceIdentifiers,
|
|
49
|
+
)
|
|
48
50
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
49
51
|
|
|
50
52
|
if TYPE_CHECKING:
|
|
@@ -9,7 +9,6 @@ from pydantic import Field, Secret
|
|
|
9
9
|
|
|
10
10
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
11
|
from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
|
|
12
|
-
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
13
12
|
from unstructured_ingest.v2.logger import logger
|
|
14
13
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
15
14
|
DestinationRegistryEntry,
|
|
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
|
30
29
|
BlobStoreUploadStager,
|
|
31
30
|
BlobStoreUploadStagerConfig,
|
|
32
31
|
)
|
|
32
|
+
from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
|
|
33
33
|
|
|
34
34
|
if TYPE_CHECKING:
|
|
35
35
|
from adlfs import AzureBlobFileSystem
|