unstructured-ingest 0.5.20__py3-none-any.whl → 0.5.23__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.

Files changed (29)
  1. test/integration/connectors/test_astradb.py +8 -2
  2. unstructured_ingest/__version__.py +1 -1
  3. unstructured_ingest/embed/interfaces.py +7 -3
  4. unstructured_ingest/utils/data_prep.py +17 -5
  5. unstructured_ingest/utils/table.py +11 -4
  6. unstructured_ingest/v2/interfaces/__init__.py +8 -1
  7. unstructured_ingest/v2/interfaces/file_data.py +13 -116
  8. unstructured_ingest/v2/processes/connectors/delta_table.py +8 -3
  9. unstructured_ingest/v2/processes/connectors/duckdb/base.py +4 -3
  10. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +5 -2
  11. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -2
  12. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -1
  13. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +7 -1
  14. unstructured_ingest/v2/processes/connectors/kdbai.py +6 -3
  15. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +10 -2
  16. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +5 -3
  17. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -1
  18. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +7 -3
  19. unstructured_ingest/v2/processes/connectors/sql/sql.py +22 -9
  20. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -1
  21. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +5 -7
  22. unstructured_ingest/v2/types/__init__.py +0 -0
  23. unstructured_ingest/v2/types/file_data.py +116 -0
  24. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/METADATA +168 -17
  25. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/RECORD +29 -27
  26. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/LICENSE.md +0 -0
  27. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/WHEEL +0 -0
  28. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/entry_points.txt +0 -0
  29. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/top_level.txt +0 -0

test/integration/connectors/test_astradb.py
@@ -1,3 +1,4 @@
+import contextlib
 import json
 import os
 from dataclasses import dataclass
@@ -231,6 +232,13 @@ def test_astra_create_destination():
     )
     collection_name = "system_created-123"
     formatted_collection_name = "system_created_123"
+
+    client = AstraDBClient()
+    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
+    with contextlib.suppress(Exception):
+        # drop collection before trying to create it
+        db.drop_collection(formatted_collection_name)
+
     created = uploader.create_destination(destination_name=collection_name, vector_length=3072)
     assert created
     assert uploader.upload_config.collection_name == formatted_collection_name
@@ -239,8 +247,6 @@ def test_astra_create_destination():
     assert not created
 
     # cleanup
-    client = AstraDBClient()
-    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
     db.drop_collection(formatted_collection_name)
 
 
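
The new setup block wraps a pre-emptive drop_collection call in contextlib.suppress(Exception), so the test no longer fails when the collection does not already exist and can be re-run cleanly. A self-contained sketch of that pattern (drop_collection below is a hypothetical stand-in for the AstraDB client call):

    import contextlib

    def drop_collection(name: str) -> None:
        # Stand-in for db.drop_collection(); raises the way dropping a missing collection might.
        raise RuntimeError(f"collection {name} does not exist")

    # Without suppress(), the setup step would raise on a fresh database;
    # with it, the drop is best-effort and the test proceeds to create_destination().
    with contextlib.suppress(Exception):
        drop_collection("system_created_123")
    print("setup continues even though the drop failed")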

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.20"  # pragma: no cover
+__version__ = "0.5.23"  # pragma: no cover

unstructured_ingest/embed/interfaces.py
@@ -2,10 +2,10 @@ from abc import ABC
 from dataclasses import dataclass
 from typing import Any, Optional
 
-import numpy as np
 from pydantic import BaseModel, Field
 
 from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
 
 EMBEDDINGS_KEY = "embeddings"
 
@@ -32,7 +32,6 @@ class BaseEncoder(ABC):
 
 @dataclass
 class BaseEmbeddingEncoder(BaseEncoder, ABC):
-
     def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
@@ -46,8 +45,11 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
         return self.embed_query(query="Q")
 
     @property
+    @requires_dependencies(["numpy"])
     def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)
 
@@ -86,7 +88,6 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
 
 @dataclass
 class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
-
     async def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
@@ -100,8 +101,11 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
         return await self.embed_query(query="Q")
 
     @property
+    @requires_dependencies(["numpy"])
     async def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = await self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)
 
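
The module-level numpy import is gone; is_unit_vector now imports numpy lazily and is guarded by @requires_dependencies from unstructured_ingest.utils.dep_check. The sketch below is a simplified stand-in for what such a guard does, not the package's actual implementation:

    import functools
    import importlib.util
    from typing import Optional

    def requires_dependencies(deps: list[str], extras: Optional[str] = None):
        # Simplified stand-in: verify optional deps are importable before running the function.
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                missing = [dep for dep in deps if importlib.util.find_spec(dep) is None]
                if missing:
                    hint = f"pip install unstructured-ingest[{extras}]" if extras else f"pip install {' '.join(missing)}"
                    raise ImportError(f"missing optional dependencies {missing}; try: {hint}")
                return func(*args, **kwargs)
            return wrapper
        return decorator

    @requires_dependencies(["numpy"])
    def is_unit_vector(embedding: list[float]) -> bool:
        import numpy as np  # loaded only when the guarded function actually runs

        return bool(np.isclose(np.linalg.norm(embedding), 1.0, rtol=1e-03))

The same move-imports-into-the-function pattern repeats for pandas throughout the connector changes below, so importing the base package no longer requires the heavy optional dependencies to be installed.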

unstructured_ingest/utils/data_prep.py
@@ -2,20 +2,22 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
 
 from unstructured_ingest.utils import ndjson
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
 
 T = TypeVar("T")
 IterableT = Iterable[T]
 
 
-def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
+def split_dataframe(df: "DataFrame", chunk_size: int = 100) -> Generator["DataFrame", None, None]:
     num_chunks = len(df) // chunk_size + 1
     for i in range(num_chunks):
         yield df[i * chunk_size : (i + 1) * chunk_size]
@@ -144,9 +146,13 @@ def get_data_by_suffix(path: Path) -> list[dict]:
         elif path.suffix == ".ndjson":
             return ndjson.load(f)
         elif path.suffix == ".csv":
+            import pandas as pd
+
             df = pd.read_csv(path)
             return df.to_dict(orient="records")
         elif path.suffix == ".parquet":
+            import pandas as pd
+
             df = pd.read_parquet(path)
             return df.to_dict(orient="records")
         else:
@@ -180,6 +186,9 @@ def get_data(path: Union[Path, str]) -> list[dict]:
            return ndjson.load(f)
    except Exception as e:
        logger.warning(f"failed to read {path} as ndjson: {e}")
+
+    import pandas as pd
+
    try:
        df = pd.read_csv(path)
        return df.to_dict(orient="records")
@@ -202,7 +211,10 @@ def get_json_data(path: Path) -> list[dict]:
        raise ValueError(f"Unsupported file type: {path}")
 
 
-def get_data_df(path: Path) -> pd.DataFrame:
+@requires_dependencies(["pandas"])
+def get_data_df(path: Path) -> "DataFrame":
+    import pandas as pd
+
    with path.open() as f:
        if path.suffix == ".json":
            data = json.load(f)
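
With pandas imported only under TYPE_CHECKING, the DataFrame annotations become strings and pandas is loaded inside the functions that need it. The behavior of split_dataframe is unchanged; a quick usage sketch, assuming pandas is installed:

    import pandas as pd

    from unstructured_ingest.utils.data_prep import split_dataframe

    df = pd.DataFrame({"id": range(250), "text": ["example"] * 250})

    # split_dataframe yields DataFrame slices of at most chunk_size rows,
    # which is how the uploaders below batch their writes.
    for chunk in split_dataframe(df, chunk_size=100):
        print(len(chunk))  # 100, 100, 50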

unstructured_ingest/utils/table.py
@@ -1,11 +1,16 @@
-from typing import Any
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any
 
 from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from pandas import DataFrame
 
 
+@requires_dependencies(["pandas"])
 def get_default_pandas_dtypes() -> dict[str, Any]:
+    import pandas as pd
+
     return {
         "text": pd.StringDtype(),  # type: ignore
         "type": pd.StringDtype(),  # type: ignore
@@ -57,7 +62,9 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
 def convert_to_pandas_dataframe(
     elements_dict: list[dict[str, Any]],
     drop_empty_cols: bool = False,
-) -> pd.DataFrame:
+) -> "DataFrame":
+    import pandas as pd
+
     # Flatten metadata if it hasn't already been flattened
     for d in elements_dict:
         if metadata := d.pop("metadata", None):

unstructured_ingest/v2/interfaces/__init__.py
@@ -1,6 +1,13 @@
+from unstructured_ingest.v2.types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
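
Because the names are re-exported here, existing imports keep working while the classes themselves now live under v2/types. Both of the following resolve to the same objects:

    # old path, still supported via the re-export above
    from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers

    # new canonical path introduced in this release
    from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers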

unstructured_ingest/v2/interfaces/file_data.py
@@ -1,116 +1,13 @@
-import json
-from pathlib import Path
-from typing import Any, Optional
-from uuid import NAMESPACE_DNS, uuid5
-
-from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
-
-from unstructured_ingest.v2.logger import logger
-
-
-class SourceIdentifiers(BaseModel):
-    filename: str
-    fullpath: str
-    rel_path: Optional[str] = None
-
-    @property
-    def filename_stem(self) -> str:
-        return Path(self.filename).stem
-
-    @property
-    def relative_path(self) -> str:
-        return self.rel_path or self.fullpath
-
-
-class FileDataSourceMetadata(BaseModel):
-    url: Optional[str] = None
-    version: Optional[str] = None
-    record_locator: Optional[dict[str, Any]] = None
-    date_created: Optional[str] = None
-    date_modified: Optional[str] = None
-    date_processed: Optional[str] = None
-    permissions_data: Optional[list[dict[str, Any]]] = None
-    filesize_bytes: Optional[int] = None
-
-
-class FileData(BaseModel):
-    identifier: str
-    connector_type: str
-    source_identifiers: SourceIdentifiers
-    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
-    additional_metadata: dict[str, Any] = Field(default_factory=dict)
-    reprocess: bool = False
-    local_download_path: Optional[str] = None
-    display_name: Optional[str] = None
-
-    @classmethod
-    def from_file(cls, path: str) -> "FileData":
-        path = Path(path).resolve()
-        if not path.exists() or not path.is_file():
-            raise ValueError(f"file path not valid: {path}")
-        with open(str(path.resolve()), "rb") as f:
-            file_data_dict = json.load(f)
-        file_data = cls.model_validate(file_data_dict)
-        return file_data
-
-    @classmethod
-    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
-        file_data_dict = file_data.model_dump()
-        return cls.model_validate(file_data_dict, **kwargs)
-
-    def to_file(self, path: str) -> None:
-        path = Path(path).resolve()
-        path.parent.mkdir(parents=True, exist_ok=True)
-        with open(str(path.resolve()), "w") as f:
-            json.dump(self.model_dump(), f, indent=2)
-
-
-class BatchItem(BaseModel):
-    identifier: str
-    version: Optional[str] = None
-
-
-class BatchFileData(FileData):
-    identifier: str = Field(init=False)
-    batch_items: list[BatchItem]
-    source_identifiers: Optional[SourceIdentifiers] = None
-
-    @field_validator("batch_items")
-    @classmethod
-    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
-        if not v:
-            raise ValueError("batch items cannot be empty")
-        all_identifiers = [item.identifier for item in v]
-        if len(all_identifiers) != len(set(all_identifiers)):
-            raise ValueError(f"duplicate identifiers: {all_identifiers}")
-        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
-        return sorted_batch_items
-
-    @model_validator(mode="before")
-    @classmethod
-    def populate_identifier(cls, data: Any) -> Any:
-        if isinstance(data, dict) and "identifier" not in data:
-            batch_items = data["batch_items"]
-            identifier_data = json.dumps(
-                {item.identifier: item.version for item in batch_items}, sort_keys=True
-            )
-            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
-        return data
-
-
-def file_data_from_file(path: str) -> FileData:
-    try:
-        return BatchFileData.from_file(path=path)
-    except ValidationError:
-        logger.debug(f"{path} not detected as batch file data")
-
-    return FileData.from_file(path=path)
-
-
-def file_data_from_dict(data: dict) -> FileData:
-    try:
-        return BatchFileData.model_validate(data)
-    except ValidationError:
-        logger.debug(f"{data} not valid for batch file data")
-
-    return FileData.model_validate(data)
+"""
+COMPATABILITY NOTICE:
+This file has moved to the v2/types/ module.
+The following line exists for backward compatibility.
+"""
+
+from unstructured_ingest.v2.types.file_data import *  # noqa - star imports are bad, but this is for maximal backward compatability
+
+# Eventually this file should go away. Let's start warning users now:
+logger.warning(  # noqa - using logger from the star import
+    "Importing file_data.py through interfaces is deprecated. "
+    "Please use unstructured_ingest.v2.types.file_data instead!"
+)
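
The models themselves are unchanged, only relocated, and importing unstructured_ingest.v2.interfaces.file_data directly now logs the deprecation warning shown above. A small round-trip sketch against the new module path (the identifier, connector type, and file paths below are hypothetical):

    from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers

    file_data = FileData(
        identifier="example-0001",
        connector_type="local",
        source_identifiers=SourceIdentifiers(filename="report.pdf", fullpath="/data/report.pdf"),
    )
    file_data.to_file("/tmp/report.pdf.json")  # serialize for a later pipeline step
    assert FileData.from_file("/tmp/report.pdf.json").identifier == file_data.identifier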

unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -3,10 +3,9 @@ import traceback
 from dataclasses import dataclass, field
 from multiprocessing import Process, Queue
 from pathlib import Path
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
@@ -27,6 +26,9 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
 
 CONNECTOR_TYPE = "delta_table"
 
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 
 @requires_dependencies(["deltalake"], extras="delta-table")
 def write_deltalake_with_error_handling(queue, **kwargs):
@@ -136,7 +138,7 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         updated_upload_path = os.path.join(
             self.connection_config.table_uri, file_data.source_identifiers.relative_path
         )
@@ -172,7 +174,10 @@ class DeltaTableUploader(Uploader):
             logger.error(f"Exception occurred in write_deltalake: {error_message}")
             raise RuntimeError(f"Error in write_deltalake: {error_message}")
 
+    @requires_dependencies(["pandas"], extras="delta-table")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df, file_data=file_data)
 

unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -2,9 +2,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
-import pandas as pd
-
 from unstructured_ingest.utils.data_prep import get_data, write_data
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id
 
@@ -55,7 +54,6 @@ _COLUMNS = (
 
 @dataclass
 class BaseDuckDBUploadStager(UploadStager):
-
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         data = element_dict.copy()
         metadata: dict[str, Any] = data.pop("metadata", {})
@@ -72,6 +70,7 @@ class BaseDuckDBUploadStager(UploadStager):
         data = {k: v for k, v in data.items() if k in _COLUMNS}
         return data
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(
         self,
         elements_filepath: Path,
@@ -80,6 +79,8 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         elements_contents = get_data(path=elements_filepath)
         output_filename_suffix = Path(elements_filepath).suffix
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"

unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp
 
 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as DuckDBConnection
+    from pandas import DataFrame
 
 CONNECTOR_TYPE = "duckdb"
 
@@ -101,7 +101,7 @@ class DuckDBUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def upload_dataframe(self, df: pd.DataFrame) -> None:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
 
         with self.connection_config.get_client() as conn:
@@ -109,7 +109,10 @@ class DuckDBUploader(Uploader):
                 f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 
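
run_data builds a pandas DataFrame and the INSERT statement reads from it as "FROM df": DuckDB's replacement scan resolves that name against the DataFrame in the caller's scope. A minimal sketch of the mechanism, assuming duckdb and pandas are installed (the table and column names are illustrative):

    import duckdb
    import pandas as pd

    df = pd.DataFrame([{"id": "1", "text": "hello"}, {"id": "2", "text": "world"}])

    conn = duckdb.connect()  # in-memory database for the sketch
    conn.execute("CREATE TABLE elements (id VARCHAR, text VARCHAR)")
    # "FROM df" resolves to the local DataFrame via DuckDB's replacement scan,
    # mirroring the uploader's "INSERT INTO ... BY NAME SELECT * FROM df".
    conn.execute("INSERT INTO elements BY NAME SELECT * FROM df")
    print(conn.execute("SELECT count(*) FROM elements").fetchone())  # (2,)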

unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp
 
 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as MotherDuckConnection
+    from pandas import DataFrame
 
 CONNECTOR_TYPE = "motherduck"
 
@@ -100,7 +100,7 @@ class MotherDuckUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def upload_dataframe(self, df: pd.DataFrame) -> None:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
         database = self.connection_config.database
         db_schema = self.connection_config.db_schema
@@ -109,7 +109,10 @@ class MotherDuckUploader(Uploader):
         with self.connection_config.get_client() as conn:
             conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')
 
+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)
 

unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
-from pydantic import BaseModel, Field, Secret, SecretStr
+from pydantic import BaseModel, Field, Secret, SecretStr, field_validator
 
 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -98,6 +98,12 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
     ca_certs: Optional[Path] = None
     access_config: Secret[ElasticsearchAccessConfig]
 
+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:

unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
-from pydantic import BaseModel, Field, Secret
+from pydantic import BaseModel, Field, Secret, field_validator
 
 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -78,6 +78,12 @@ class OpenSearchConnectionConfig(ConnectionConfig):
 
     access_config: Secret[OpenSearchAccessConfig]
 
+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
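
Both the Elasticsearch and OpenSearch connection configs gain the same before-mode validator, so hosts may now be passed as a single string rather than a list. A stand-alone pydantic sketch of the behavior (HostsConfig is an illustrative stand-in, not the real ConnectionConfig):

    from typing import Optional

    from pydantic import BaseModel, field_validator

    class HostsConfig(BaseModel):
        hosts: Optional[list[str]] = None

        @field_validator("hosts", mode="before")
        def to_list(cls, value):
            # Coerce a bare string into a one-element list before normal validation runs.
            if isinstance(value, str):
                return [value]
            return value

    print(HostsConfig(hosts="http://localhost:9200").hosts)  # ['http://localhost:9200']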

unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
@@ -26,6 +25,7 @@ from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
     from kdbai_client import Database, Session, Table
+    from pandas import DataFrame
 
 CONNECTOR_TYPE = "kdbai"
 
@@ -118,11 +118,11 @@ class KdbaiUploader(Uploader):
             table = db.table(self.upload_config.table_name)
             yield table
 
-    def upsert_batch(self, batch: pd.DataFrame):
+    def upsert_batch(self, batch: "DataFrame"):
         with self.get_table() as table:
             table.insert(batch)
 
-    def process_dataframe(self, df: pd.DataFrame):
+    def process_dataframe(self, df: "DataFrame"):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
             f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
@@ -130,7 +130,10 @@ class KdbaiUploader(Uploader):
         for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)
 
+    @requires_dependencies(["pandas"], extras="kdbai")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)
 

unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
@@ -8,7 +8,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
 
-import pandas as pd
 from pydantic import Field
 
 from unstructured_ingest.error import DestinationConnectionError
@@ -26,6 +25,7 @@ CONNECTOR_TYPE = "lancedb"
 if TYPE_CHECKING:
     from lancedb import AsyncConnection
     from lancedb.table import AsyncTable
+    from pandas import DataFrame
 
 
 class LanceDBConnectionConfig(ConnectionConfig, ABC):
@@ -69,6 +69,7 @@ class LanceDBUploadStager(UploadStager):
         default_factory=LanceDBUploadStagerConfig
     )
 
+    @requires_dependencies(["pandas"], extras="lancedb")
     def run(
         self,
         elements_filepath: Path,
@@ -77,6 +78,8 @@ class LanceDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         with open(elements_filepath) as elements_file:
             elements_contents: list[dict] = json.load(elements_file)
 
@@ -129,7 +132,10 @@ class LanceDBUploader(Uploader):
         finally:
             table.close()
 
+    @requires_dependencies(["pandas"], extras="lancedb")
     async def run_async(self, path, file_data, **kwargs):
+        import pandas as pd
+
         df = pd.read_feather(path)
         async with self.get_table() as table:
             schema = await table.schema()
@@ -144,7 +150,9 @@ class LanceDBUploader(Uploader):
             await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
             await table.add(data=df)
 
-    def _fit_to_schema(self, df: pd.DataFrame, schema) -> pd.DataFrame:
+    def _fit_to_schema(self, df: "DataFrame", schema) -> "DataFrame":
+        import pandas as pd
+
         columns = set(df.columns)
         schema_fields = set(schema.names)
         columns_to_drop = columns - schema_fields

unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py
@@ -3,8 +3,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.data_prep import split_dataframe
@@ -27,6 +25,7 @@ if TYPE_CHECKING:
     from databricks.sdk.core import oauth_service_principal
     from databricks.sql.client import Connection as DeltaTableConnection
     from databricks.sql.client import Cursor as DeltaTableCursor
+    from pandas import DataFrame
 
 CONNECTOR_TYPE = "databricks_delta_tables"
 
@@ -180,7 +179,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
         )
         return statement
 
-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:

unstructured_ingest/v2/processes/connectors/sql/singlestore.py
@@ -3,9 +3,9 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import pandas as pd
 from pydantic import Field, Secret
 
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -46,6 +46,7 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):
     database: Optional[str] = Field(default=None, description="SingleStore database")
 
     @contextmanager
+    @requires_dependencies(["singlestoredb"], extras="singlestore")
     def get_connection(self) -> Generator["SingleStoreConnection", None, None]:
        import singlestoredb as s2
 
@@ -130,9 +131,12 @@ class SingleStoreUploader(SQLUploader):
     values_delimiter: str = "%s"
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["pandas"], extras="singlestore")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []