unstructured-ingest 0.5.23__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

Files changed (105)
  1. test/integration/connectors/databricks/test_volumes_native.py +1 -1
  2. test/integration/connectors/duckdb/test_duckdb.py +1 -1
  3. test/integration/connectors/duckdb/test_motherduck.py +1 -1
  4. test/integration/connectors/elasticsearch/test_elasticsearch.py +1 -1
  5. test/integration/connectors/elasticsearch/test_opensearch.py +1 -1
  6. test/integration/connectors/sql/test_databricks_delta_tables.py +1 -1
  7. test/integration/connectors/sql/test_postgres.py +1 -1
  8. test/integration/connectors/sql/test_singlestore.py +1 -1
  9. test/integration/connectors/sql/test_snowflake.py +1 -1
  10. test/integration/connectors/sql/test_sqlite.py +1 -1
  11. test/integration/connectors/test_astradb.py +1 -1
  12. test/integration/connectors/test_azure_ai_search.py +1 -1
  13. test/integration/connectors/test_chroma.py +1 -1
  14. test/integration/connectors/test_delta_table.py +1 -1
  15. test/integration/connectors/test_lancedb.py +1 -1
  16. test/integration/connectors/test_milvus.py +1 -1
  17. test/integration/connectors/test_mongodb.py +1 -1
  18. test/integration/connectors/test_neo4j.py +5 -5
  19. test/integration/connectors/test_onedrive.py +1 -1
  20. test/integration/connectors/test_pinecone.py +1 -1
  21. test/integration/connectors/test_qdrant.py +1 -1
  22. test/integration/connectors/test_redis.py +1 -1
  23. test/integration/connectors/test_s3.py +1 -1
  24. test/integration/connectors/test_vectara.py +68 -56
  25. test/integration/connectors/utils/validation/destination.py +2 -1
  26. test/integration/connectors/utils/validation/source.py +2 -1
  27. test/integration/connectors/weaviate/test_local.py +1 -1
  28. test/unit/test_html.py +1 -1
  29. test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +1 -1
  30. test/unit/v2/connectors/motherduck/test_base.py +1 -2
  31. test/unit/v2/connectors/sql/test_sql.py +1 -1
  32. unstructured_ingest/__version__.py +1 -1
  33. unstructured_ingest/utils/html.py +2 -1
  34. unstructured_ingest/v2/interfaces/__init__.py +0 -13
  35. unstructured_ingest/v2/interfaces/downloader.py +1 -1
  36. unstructured_ingest/v2/interfaces/indexer.py +1 -1
  37. unstructured_ingest/v2/interfaces/upload_stager.py +2 -2
  38. unstructured_ingest/v2/interfaces/uploader.py +2 -3
  39. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -2
  40. unstructured_ingest/v2/pipeline/steps/download.py +2 -3
  41. unstructured_ingest/v2/pipeline/steps/embed.py +1 -2
  42. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  43. unstructured_ingest/v2/pipeline/steps/partition.py +1 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  46. unstructured_ingest/v2/pipeline/steps/upload.py +2 -2
  47. unstructured_ingest/v2/processes/connectors/airtable.py +1 -2
  48. unstructured_ingest/v2/processes/connectors/astradb.py +7 -5
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +1 -1
  50. unstructured_ingest/v2/processes/connectors/chroma.py +1 -1
  51. unstructured_ingest/v2/processes/connectors/confluence.py +5 -3
  52. unstructured_ingest/v2/processes/connectors/couchbase.py +7 -5
  53. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -3
  54. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +64 -19
  55. unstructured_ingest/v2/processes/connectors/delta_table.py +1 -1
  56. unstructured_ingest/v2/processes/connectors/discord.py +5 -3
  57. unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -1
  58. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +1 -1
  59. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +1 -1
  60. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -5
  61. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -1
  62. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +5 -3
  65. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -1
  66. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -4
  67. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +1 -1
  68. unstructured_ingest/v2/processes/connectors/gitlab.py +5 -3
  69. unstructured_ingest/v2/processes/connectors/google_drive.py +5 -3
  70. unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +1 -1
  71. unstructured_ingest/v2/processes/connectors/jira.py +5 -3
  72. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +5 -3
  73. unstructured_ingest/v2/processes/connectors/kdbai.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -4
  75. unstructured_ingest/v2/processes/connectors/local.py +5 -3
  76. unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
  77. unstructured_ingest/v2/processes/connectors/mongodb.py +7 -5
  78. unstructured_ingest/v2/processes/connectors/neo4j.py +1 -1
  79. unstructured_ingest/v2/processes/connectors/notion/connector.py +5 -3
  80. unstructured_ingest/v2/processes/connectors/onedrive.py +5 -3
  81. unstructured_ingest/v2/processes/connectors/outlook.py +5 -2
  82. unstructured_ingest/v2/processes/connectors/pinecone.py +1 -1
  83. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +1 -1
  84. unstructured_ingest/v2/processes/connectors/redisdb.py +1 -1
  85. unstructured_ingest/v2/processes/connectors/salesforce.py +5 -3
  86. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -3
  87. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  88. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +1 -1
  89. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
  90. unstructured_ingest/v2/processes/connectors/sql/sql.py +13 -8
  91. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +3 -3
  92. unstructured_ingest/v2/processes/connectors/vectara.py +1 -1
  93. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +1 -1
  94. unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +5 -3
  95. unstructured_ingest/v2/processes/filter.py +1 -1
  96. unstructured_ingest/v2/processes/uncompress.py +1 -1
  97. unstructured_ingest/v2/processes/utils/blob_storage.py +2 -1
  98. unstructured_ingest/v2/utils.py +1 -1
  99. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/METADATA +101 -101
  100. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/RECORD +104 -105
  101. unstructured_ingest/v2/interfaces/file_data.py +0 -13
  102. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/LICENSE.md +0 -0
  103. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/WHEEL +0 -0
  104. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/entry_points.txt +0 -0
  105. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/top_level.txt +0 -0

test/unit/v2/connectors/motherduck/test_base.py
@@ -3,10 +3,9 @@ from pathlib import Path
 import pytest
 from pytest_mock import MockerFixture

-from unstructured_ingest.v2.interfaces import FileData
-from unstructured_ingest.v2.interfaces.file_data import SourceIdentifiers
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStagerConfig
 from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
+from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers


 @pytest.fixture

test/unit/v2/connectors/sql/test_sql.py
@@ -4,13 +4,13 @@ import pandas as pd
 import pytest
 from pytest_mock import MockerFixture

-from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     SQLConnectionConfig,
     SQLUploader,
     SQLUploaderConfig,
     SQLUploadStager,
 )
+from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers


 @pytest.fixture

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.23"  # pragma: no cover
+__version__ = "0.6.0"  # pragma: no cover

unstructured_ingest/utils/html.py
@@ -7,8 +7,9 @@ from uuid import NAMESPACE_DNS, uuid5
 from pydantic import BaseModel, Field

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
+from unstructured_ingest.v2.interfaces import DownloadResponse
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers

 if TYPE_CHECKING:
     from bs4.element import Tag

unstructured_ingest/v2/interfaces/__init__.py
@@ -1,11 +1,3 @@
-from unstructured_ingest.v2.types.file_data import (
-    BatchFileData,
-    BatchItem,
-    FileData,
-    FileDataSourceMetadata,
-    SourceIdentifiers,
-)
-
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
 from .indexer import Indexer, IndexerConfig
@@ -19,7 +11,6 @@ __all__ = [
     "download_responses",
     "Downloader",
     "DownloaderConfig",
-    "FileData",
     "Indexer",
     "IndexerConfig",
     "BaseProcess",
@@ -28,13 +19,9 @@ __all__ = [
     "UploadStagerConfig",
     "Uploader",
     "UploaderConfig",
-    "SourceIdentifiers",
     "UploadContent",
     "AccessConfig",
     "ConnectionConfig",
     "BaseConnector",
-    "FileDataSourceMetadata",
-    "BatchFileData",
-    "BatchItem",
     "VectorDBUploader",
 ]
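
The common thread across these hunks: FileData, SourceIdentifiers, FileDataSourceMetadata, BatchFileData, and BatchItem are no longer re-exported from unstructured_ingest.v2.interfaces; every module now imports them from unstructured_ingest.v2.types.file_data. A minimal migration sketch for downstream code (hypothetical consumer module; assumes FileData keeps its 0.5.x fields such as identifier and source_identifiers, which this diff does not show):

    # 0.5.23 style import -- this re-export is removed in 0.6.0:
    # from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers

    # 0.6.0 style import:
    from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers

    def display_name(file_data: FileData) -> str:
        # Assumption: FileData still exposes identifier and source_identifiers as in 0.5.x.
        source_identifiers = file_data.source_identifiers
        return source_identifiers.filename if source_identifiers else file_data.identifier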

unstructured_ingest/v2/interfaces/downloader.py
@@ -6,8 +6,8 @@ from typing import Any, Optional, TypedDict, TypeVar, Union
 from pydantic import BaseModel, Field

 from unstructured_ingest.v2.interfaces.connector import BaseConnector
-from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
+from unstructured_ingest.v2.types.file_data import FileData


 class DownloaderConfig(BaseModel):

unstructured_ingest/v2/interfaces/indexer.py
@@ -4,8 +4,8 @@ from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
 from pydantic import BaseModel

 from unstructured_ingest.v2.interfaces.connector import BaseConnector
-from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
+from unstructured_ingest.v2.types.file_data import FileData


 class IndexerConfig(BaseModel):

unstructured_ingest/v2/interfaces/upload_stager.py
@@ -7,8 +7,8 @@ from pydantic import BaseModel

 from unstructured_ingest.utils import ndjson
 from unstructured_ingest.utils.data_prep import get_data, write_data
-from unstructured_ingest.v2.interfaces.file_data import FileData
-from unstructured_ingest.v2.interfaces.process import BaseProcess
+from unstructured_ingest.v2.interfaces import BaseProcess
+from unstructured_ingest.v2.types.file_data import FileData


 class UploadStagerConfig(BaseModel):

unstructured_ingest/v2/interfaces/uploader.py
@@ -6,9 +6,8 @@ from typing import Any, TypeVar
 from pydantic import BaseModel

 from unstructured_ingest.utils.data_prep import get_data
-from unstructured_ingest.v2.interfaces.connector import BaseConnector
-from unstructured_ingest.v2.interfaces.file_data import FileData
-from unstructured_ingest.v2.interfaces.process import BaseProcess
+from unstructured_ingest.v2.interfaces import BaseConnector, BaseProcess
+from unstructured_ingest.v2.types.file_data import FileData


 class UploaderConfig(BaseModel):

unstructured_ingest/v2/pipeline/steps/chunk.py
@@ -5,11 +5,10 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict

 from unstructured_ingest.utils.data_prep import write_data
-from unstructured_ingest.v2.interfaces import FileData
-from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.chunker import Chunker
+from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
 from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "chunk"

unstructured_ingest/v2/pipeline/steps/download.py
@@ -6,11 +6,10 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar

-from unstructured_ingest.v2.interfaces import FileData, download_responses
-from unstructured_ingest.v2.interfaces.downloader import Downloader
-from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
+from unstructured_ingest.v2.interfaces import Downloader, download_responses
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
 from unstructured_ingest.v2.utils import serialize_base_model_json

 DownloaderT = TypeVar("DownloaderT", bound=Downloader)

unstructured_ingest/v2/pipeline/steps/embed.py
@@ -5,11 +5,10 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict

 from unstructured_ingest.utils.data_prep import write_data
-from unstructured_ingest.v2.interfaces import FileData
-from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.embedder import Embedder
+from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
 from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "embed"

unstructured_ingest/v2/pipeline/steps/filter.py
@@ -2,10 +2,10 @@ import asyncio
 from dataclasses import dataclass
 from typing import Callable, Optional

-from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.filter import Filterer
+from unstructured_ingest.v2.types.file_data import file_data_from_file

 STEP_ID = "filter"


unstructured_ingest/v2/pipeline/steps/partition.py
@@ -5,11 +5,10 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict

 from unstructured_ingest.utils.data_prep import write_data
-from unstructured_ingest.v2.interfaces import FileData
-from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.partitioner import Partitioner
+from unstructured_ingest.v2.types.file_data import FileData, file_data_from_file
 from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "partition"

unstructured_ingest/v2/pipeline/steps/stage.py
@@ -4,10 +4,10 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

-from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
-from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
+from unstructured_ingest.v2.interfaces import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+from unstructured_ingest.v2.types.file_data import file_data_from_file
 from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "upload_stage"

unstructured_ingest/v2/pipeline/steps/uncompress.py
@@ -3,10 +3,10 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, TypedDict

-from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.uncompress import Uncompressor
+from unstructured_ingest.v2.types.file_data import file_data_from_file

 STEP_ID = "uncompress"


unstructured_ingest/v2/pipeline/steps/upload.py
@@ -3,11 +3,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

-from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
-from unstructured_ingest.v2.interfaces.uploader import UploadContent
+from unstructured_ingest.v2.interfaces import UploadContent
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
 from unstructured_ingest.v2.pipeline.otel import instrument
+from unstructured_ingest.v2.types.file_data import file_data_from_file

 STEP_ID = "upload"


unstructured_ingest/v2/processes/connectors/airtable.py
@@ -13,14 +13,13 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     DownloadResponse,
-    FileData,
     Indexer,
     IndexerConfig,
-    SourceIdentifiers,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
+from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers

 if TYPE_CHECKING:
     from pyairtable import Api

unstructured_ingest/v2/processes/connectors/astradb.py
@@ -21,17 +21,12 @@ from unstructured_ingest.utils.string_and_date_utils import truncate_string_byte
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
-    BatchFileData,
-    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
     DownloadResponse,
-    FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
-    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -44,6 +39,13 @@ from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.utils import format_and_truncate_orig_elements
+from unstructured_ingest.v2.types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)

 if TYPE_CHECKING:
     from astrapy import AsyncCollection as AstraDBAsyncCollection

unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -12,7 +12,6 @@ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
-    FileData,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
+from unstructured_ingest.v2.types.file_data import FileData
 from unstructured_ingest.v2.utils import get_enhanced_element_id

 if TYPE_CHECKING:

unstructured_ingest/v2/processes/connectors/chroma.py
@@ -12,7 +12,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
-    FileData,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -20,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.types.file_data import FileData
 from unstructured_ingest.v2.utils import get_enhanced_element_id

 from .utils import conform_string_to_dict

unstructured_ingest/v2/processes/connectors/confluence.py
@@ -15,17 +15,19 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     DownloadResponse,
-    FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
-    SourceIdentifiers,
     download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
+from unstructured_ingest.v2.types.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)

 if TYPE_CHECKING:
     from atlassian import Confluence

unstructured_ingest/v2/processes/connectors/couchbase.py
@@ -17,17 +17,12 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
-    BatchFileData,
-    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
     DownloadResponse,
-    FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
-    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -39,6 +34,13 @@ from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
+from unstructured_ingest.v2.types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)

 if TYPE_CHECKING:
     from couchbase.cluster import Cluster

unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -20,15 +20,17 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     DownloadResponse,
-    FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
-    SourceIdentifiers,
     Uploader,
     UploaderConfig,
 )
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.types.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)

 if TYPE_CHECKING:
     from databricks.sdk import WorkspaceClient

unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
@@ -1,14 +1,14 @@
-import json
 import os
+import tempfile
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field

-from unstructured_ingest.utils.data_prep import write_data
-from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
+from unstructured_ingest.utils.data_prep import get_data_df, write_data
+from unstructured_ingest.v2.interfaces import Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -19,9 +19,13 @@ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables imp
     DatabricksDeltaTablesUploadStager,
     DatabricksDeltaTablesUploadStagerConfig,
 )
+from unstructured_ingest.v2.types.file_data import FileData

 CONNECTOR_TYPE = "databricks_volume_delta_tables"

+if TYPE_CHECKING:
+    from pandas import DataFrame
+

 class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
     database: str = Field(description="Database name", default="default")
@@ -30,10 +34,12 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMix

 @dataclass
 class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-        write_data(path=output_path.with_suffix(".json"), data=data, indent=None)
+        final_output_path = output_path.with_suffix(".json")
+        write_data(path=final_output_path, data=data, indent=None)
+        return final_output_path


 @dataclass
@@ -41,6 +47,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE
+    _columns: Optional[dict[str, str]] = None

     def precheck(self) -> None:
         with self.connection_config.get_cursor() as cursor:
@@ -84,20 +91,58 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor

-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with self.get_cursor(staging_allowed_local_path=str(path.parent)) as cursor:
-            catalog_path = self.get_output_path(file_data=file_data)
-            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
-            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
-            logger.debug(
-                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
+    def get_table_columns(self) -> dict[str, str]:
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = {desc[0]: desc[1] for desc in cursor.description}
+        return self._columns
+
+    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
+        import pandas as pd
+
+        table_columns = self.get_table_columns()
+        columns = set(df.columns)
+        schema_fields = set(table_columns.keys())
+        columns_to_drop = columns - schema_fields
+        missing_columns = schema_fields - columns
+
+        if columns_to_drop:
+            logger.info(
+                "Following columns will be dropped to match the table's schema: "
+                f"{', '.join(columns_to_drop)}"
+            )
+        if missing_columns and add_missing_columns:
+            logger.info(
+                "Following null filled columns will be added to match the table's schema:"
+                f" {', '.join(missing_columns)} "
             )
-            with path.open() as f:
-                data = json.load(f)
-            columns = data[0].keys()
-            column_str = ", ".join(columns)
-            sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
-            cursor.execute(sql_statment)
+
+        df = df.drop(columns=columns_to_drop)
+
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
+        return df
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            df = get_data_df()
+            df = self._fit_to_schema(df=df)
+            temp_path = Path(temp_dir) / path.name
+            df.to_json(temp_path, orient="records", lines=False)
+            with self.get_cursor(staging_allowed_local_path=temp_dir) as cursor:
+                catalog_path = self.get_output_path(file_data=file_data)
+                logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+                cursor.execute(f"PUT '{temp_path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+                logger.debug(
+                    f"migrating content from {catalog_path} to "
+                    f"table {self.upload_config.table_name}"
+                )
+                columns = list(df.columns)
+                column_str = ", ".join(columns)
+                sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
+                cursor.execute(sql_statment)


 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
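
Beyond the import relocation, volumes_table.py changes behaviour: the uploader now reads the staged records into a dataframe, fits them to the destination table's schema (dropping columns the table does not have and adding the table's missing columns as null-filled series), writes the result to a temporary JSON file, and only then runs the PUT/INSERT. A standalone sketch of that schema-fitting step, using an assumed table schema instead of a live cursor.description (this is not the connector code itself):

    import pandas as pd

    # Assumed destination schema (column name -> type), standing in for get_table_columns().
    table_columns = {"id": "string", "text": "string", "embeddings": "array"}

    df = pd.DataFrame([{"id": "1", "text": "hello", "extra_field": "not in the table"}])

    columns_to_drop = set(df.columns) - set(table_columns)  # in the dataframe, not in the table
    missing_columns = set(table_columns) - set(df.columns)  # in the table, not in the dataframe

    df = df.drop(columns=columns_to_drop)
    for column in missing_columns:
        df[column] = pd.Series()  # null-filled column so the INSERT's column list matches

    print(sorted(df.columns))  # ['embeddings', 'id', 'text']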

unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -15,7 +15,6 @@ from unstructured_ingest.utils.table import convert_to_pandas_dataframe
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
-    FileData,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.types.file_data import FileData

 CONNECTOR_TYPE = "delta_table"


unstructured_ingest/v2/processes/connectors/discord.py
@@ -12,14 +12,16 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     DownloadResponse,
-    FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
-    SourceIdentifiers,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+from unstructured_ingest.v2.types.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)

 if TYPE_CHECKING:
     from discord import Client as DiscordClient

unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -4,7 +4,8 @@ from typing import Any

 from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import FileData, UploadStager
+from unstructured_ingest.v2.interfaces import UploadStager
+from unstructured_ingest.v2.types.file_data import FileData
 from unstructured_ingest.v2.utils import get_enhanced_element_id

 _COLUMNS = (

unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py
@@ -11,7 +11,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
-    FileData,
     Uploader,
     UploaderConfig,
     UploadStagerConfig,
@@ -19,6 +18,7 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
+from unstructured_ingest.v2.types.file_data import FileData

 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as DuckDBConnection

unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py
@@ -12,7 +12,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
-    FileData,
     Uploader,
     UploaderConfig,
     UploadStagerConfig,
@@ -20,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
+from unstructured_ingest.v2.types.file_data import FileData

 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as MotherDuckConnection

unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py
@@ -23,17 +23,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
-    BatchFileData,
-    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
     DownloadResponse,
-    FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
-    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -45,6 +40,13 @@ from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
+from unstructured_ingest.v2.types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
 from unstructured_ingest.v2.utils import get_enhanced_element_id

 if TYPE_CHECKING:

unstructured_ingest/v2/processes/connectors/fsspec/azure.py
@@ -9,7 +9,6 @@ from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
-from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
     BlobStoreUploadStager,
     BlobStoreUploadStagerConfig,
 )
+from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata

 if TYPE_CHECKING:
     from adlfs import AzureBlobFileSystem