unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +2 -2
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0

unstructured_ingest/__version__.py
@@ -1 +1 @@
- __version__ = "0.7.2" # pragma: no cover
+ __version__ = "1.0.1" # pragma: no cover

unstructured_ingest/cli/README.md
@@ -0,0 +1,28 @@
+ # Ingest CLI
+ This package helps map user input via a cli to the underlying ingest code to run a small ETL pipeline.
+
+ ## Design Reference
+ [cli.py](cli.py) is the main entrypoint to run the cli itself. The key points for this is the interaction between all
+ source and destination connectors.
+
+ To manually run the cli:
+ ```shell
+ PYTHONPATH=. python unstructured_ingest/v2/main.py --help
+ ```
+
+ The `main.py` file simply wraps the generated Click command created in `cli.py`.
+
+ ### Source Commands
+ All source commands are added as sub commands to the parent ingest Click group. This allows each command to map to
+ different connectors with shared and unique parameters.
+
+ ### Destination Commands
+ All destination commands are added as sub commands to each parent source command. This allows each invocation of the source
+ sub command to display all possible destination subcommands. The code un [utils.py](./utils.py) helps structure the
+ generated text from the Click library to be more intuitive on this approach (i.e. list sub commands as `Destinations`).
+
+ ### Configs
+ The configs in [configs/](./configs) and connector specific ones in [cmds/](./cmds) help surface all user parameters that
+ are needed to marshall the input dictionary from Click into all the respective configs needed to create a full pipeline run.
+ Because click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary
+ into dataclasses that have nested fields (such as access configs).
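
The "flat dict → nested dataclasses" behavior the README describes can be pictured with a small sketch. This is a hypothetical illustration, not the package's actual `extract_config` implementation; the `AccessConfig`/`ConnectorConfig` names and the function signature here are assumptions.

```python
from dataclasses import dataclass, fields, is_dataclass
from typing import Any, Type, TypeVar

T = TypeVar("T")


@dataclass
class AccessConfig:
    api_key: str


@dataclass
class ConnectorConfig:
    remote_url: str
    access_config: AccessConfig


def extract_config(flat: dict[str, Any], config_cls: Type[T]) -> T:
    """Recursively pull the fields a dataclass needs out of a flat Click-style dict."""
    kwargs: dict[str, Any] = {}
    for f in fields(config_cls):
        if is_dataclass(f.type):
            # Nested config (e.g. an access config) gets built from the same flat dict.
            kwargs[f.name] = extract_config(flat, f.type)
        elif f.name in flat:
            kwargs[f.name] = flat[f.name]
    return config_cls(**kwargs)


# Flat dict as Click would return it -> nested config objects.
config = extract_config({"remote_url": "s3://bucket/path", "api_key": "xxx"}, ConnectorConfig)
```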

unstructured_ingest/embed/mixedbreadai.py
@@ -114,7 +114,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):

  @dataclass
  class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
-
  config: MixedbreadAIEmbeddingConfig

  async def get_exemplary_embedding(self) -> list[float]:

unstructured_ingest/interfaces/upload_stager.py
@@ -8,7 +8,7 @@ from pydantic import BaseModel
  from unstructured_ingest.data_types.file_data import FileData
  from unstructured_ingest.interfaces import BaseProcess
  from unstructured_ingest.utils import ndjson
- from unstructured_ingest.utils.data_prep import get_data, write_data
+ from unstructured_ingest.utils.data_prep import get_json_data, write_data


  class UploadStagerConfig(BaseModel):
@@ -43,7 +43,7 @@ class UploadStager(BaseProcess, ABC):
  writer.f.flush()

  def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
- elements_contents = get_data(path=input_file)
+ elements_contents = get_json_data(path=input_file)

  conformed_elements = [
  self.conform_dict(element_dict=element, file_data=file_data)

unstructured_ingest/interfaces/uploader.py
@@ -7,7 +7,7 @@ from pydantic import BaseModel

  from unstructured_ingest.data_types.file_data import FileData
  from unstructured_ingest.interfaces import BaseConnector, BaseProcess
- from unstructured_ingest.utils.data_prep import get_data
+ from unstructured_ingest.utils.data_prep import get_json_data


  class UploaderConfig(BaseModel):
@@ -45,11 +45,11 @@ class Uploader(BaseProcess, BaseConnector, ABC):
  return False

  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- data = get_data(path=path)
+ data = get_json_data(path=path)
  self.run_data(data=data, file_data=file_data, **kwargs)

  async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- data = get_data(path=path)
+ data = get_json_data(path=path)
  await self.run_data_async(data=data, file_data=file_data, **kwargs)

  def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/main.py
File without changes.

unstructured_ingest/pipeline/interfaces.py
@@ -119,7 +119,7 @@ class PipelineStep(ABC):
  iterable = iterable or []
  if iterable:
  logger.info(
- f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
+ f"calling {self.__class__.__name__} with {len(iterable)} docs", # type: ignore
  )
  else:
  logger.info(f"calling {self.__class__.__name__} with no inputs")

unstructured_ingest/pipeline/pipeline.py
@@ -220,7 +220,7 @@ class Pipeline:

  def _run(self):
  logger.info(
- f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
+ f"running local pipeline: {self} with configs: {self.context.model_dump_json()}"
  )
  if self.context.mp_supported:
  manager = mp.Manager()
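
Several hunks in this release (the two pipeline changes above, plus the Azure AI Search, Databricks, and Salesforce description strings below) join adjacent string literals into a single literal. A minimal standalone sketch of the pattern, showing that the separating space has to be carried into the joined literal:

```python
name, count = "chunker", 3

# Adjacent literals are concatenated at compile time; the space lives in the first piece.
msg_split = f"calling {name} " f"with {count} docs"

# Equivalent single literal, as rewritten throughout this release.
msg_joined = f"calling {name} with {count} docs"

assert msg_split == msg_joined
```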

unstructured_ingest/processes/chunker.py
@@ -24,6 +24,9 @@ class ChunkerConfig(BaseModel):
  default="https://api.unstructuredapp.io/general/v0/general",
  description="If chunking via api, use the following host.",
  )
+ chunk_api_timeout_ms: Optional[int] = Field(
+ default=None, description="Timeout in milliseconds for all api call during chunking."
+ )
  chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
  chunk_api_key: Optional[SecretStr] = Field(
  default=None, description="API Key for chunking endpoint."
@@ -120,6 +123,7 @@ class Chunker(BaseProcess, ABC):
  api_key=self.config.chunk_api_key.get_secret_value(),
  filename=elements_filepath,
  api_parameters=self.config.to_chunking_kwargs(),
+ timeout_ms=self.config.chunk_api_timeout_ms,
  )

  elements = assign_and_map_hash_ids(elements=elements)
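
For reference, the new `chunk_api_timeout_ms` option sits alongside the existing API chunking fields shown above. A minimal sketch of how a caller might populate `ChunkerConfig`; only the field names come from the diff, the values are illustrative, and it assumes the remaining fields keep their defaults:

```python
from unstructured_ingest.processes.chunker import ChunkerConfig

# Illustrative values; chunk_api_timeout_ms is new in 1.0.x and caps API calls made during chunking.
chunker_config = ChunkerConfig(
    chunk_by_api=True,
    chunk_api_key="my-api-key",
    chunk_api_timeout_ms=30_000,
)
```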

unstructured_ingest/processes/connectors/airtable.py
@@ -3,7 +3,6 @@ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional
  from uuid import NAMESPACE_DNS, uuid5

- import pandas
  from pydantic import BaseModel, Field, Secret, field_validator

  from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
@@ -213,10 +212,13 @@ class AirtableDownloader(Downloader):
  row_dict.update(table_row["fields"])
  return row_dict

+ @requires_dependencies(["pandas"], extras="airtable")
  def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+ import pandas as pd
+
  table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
  table_contents = self.get_table_contents(table_meta=table_meta)
- df = pandas.DataFrame.from_dict(
+ df = pd.DataFrame.from_dict(
  data=[self._table_row_to_dict(table_row=row) for row in table_contents]
  ).sort_index(axis=1)
  download_path = self.get_download_path(file_data=file_data)
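
Many hunks in this release follow the same pattern as the Airtable change above: the module-level `import pandas` is dropped and the import is deferred into a method guarded by `requires_dependencies`. As a rough sketch of why that helps, with a simplified stand-in decorator (this is not the library's actual `requires_dependencies` implementation, and the `Downloader` class here is illustrative):

```python
import functools
import importlib.util


def requires_dependencies(deps: list[str], extras: str):
    """Simplified stand-in: fail with an install hint before running the wrapped method."""

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            missing = [d for d in deps if importlib.util.find_spec(d) is None]
            if missing:
                raise ImportError(
                    f"Missing {missing}; install with `pip install unstructured-ingest[{extras}]`"
                )
            return func(*args, **kwargs)

        return wrapper

    return decorator


class Downloader:
    @requires_dependencies(["pandas"], extras="airtable")
    def run(self, rows: list[dict]):
        import pandas as pd  # deferred so the base install never pays for pandas

        return pd.DataFrame(rows).sort_index(axis=1)
```

The benefit is that importing the connector module no longer requires the optional dependency; the error only surfaces when the method that actually needs it is called.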

unstructured_ingest/processes/connectors/astradb.py
@@ -43,7 +43,7 @@ from unstructured_ingest.processes.connector_registry import (
  )
  from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
  from unstructured_ingest.utils.constants import RECORD_ID_LABEL
- from unstructured_ingest.utils.data_prep import batch_generator, get_data
+ from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes

@@ -465,7 +465,7 @@ class AstraDBUploader(Uploader):
  collection.insert_many(chunk)

  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- data = get_data(path=path)
+ data = get_json_data(path=path)
  self.run_data(data=data, file_data=file_data, **kwargs)



unstructured_ingest/processes/connectors/azure_ai_search.py
@@ -212,7 +212,7 @@ class AzureAISearchUploader(Uploader):
  raise WriteError(
  ", ".join(
  [
- f"{error.key}: " f"[{error.status_code}] {error.error_message}"
+ f"{error.key}: [{error.status_code}] {error.error_message}"
  for error in errors
  ],
  ),

unstructured_ingest/processes/connectors/confluence.py
@@ -125,7 +125,6 @@ class ConfluenceIndexer(Indexer):

  def precheck(self) -> bool:
  try:
-
  # Attempt to retrieve a list of spaces with limit=1.
  # This should only succeed if all creds are valid
  with self.connection_config.get_client() as client:
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_aws"
28
28
  class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
29
29
  account_id: Optional[str] = Field(
30
30
  default=None,
31
- description="The Databricks account ID for the Databricks " "accounts endpoint",
31
+ description="The Databricks account ID for the Databricks accounts endpoint",
32
32
  )
33
33
  profile: Optional[str] = None
34
34
  token: Optional[str] = Field(
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_azure"
28
28
  class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
29
29
  account_id: Optional[str] = Field(
30
30
  default=None,
31
- description="The Databricks account ID for the Databricks " "accounts endpoint.",
31
+ description="The Databricks account ID for the Databricks accounts endpoint.",
32
32
  )
33
33
  profile: Optional[str] = None
34
34
  azure_workspace_resource_id: Optional[str] = Field(
@@ -47,7 +47,7 @@ class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
47
47
  )
48
48
  azure_environment: Optional[str] = Field(
49
49
  default=None,
50
- description="The Azure environment type for a " "specific set of API endpoints",
50
+ description="The Azure environment type for a specific set of API endpoints",
51
51
  examples=["Public", "UsGov", "China", "Germany"],
52
52
  )
53
53
 
@@ -28,7 +28,7 @@ CONNECTOR_TYPE = "databricks_volumes_gcp"
28
28
  class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
29
29
  account_id: Optional[str] = Field(
30
30
  default=None,
31
- description="The Databricks account ID for the Databricks " "accounts endpoint.",
31
+ description="The Databricks account ID for the Databricks accounts endpoint.",
32
32
  )
33
33
  profile: Optional[str] = None
34
34
  google_credentials: Optional[str] = None

unstructured_ingest/processes/connectors/databricks/volumes_table.py
@@ -166,8 +166,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
  logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
  cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
  logger.debug(
- f"migrating content from {catalog_path} to "
- f"table {self.upload_config.table_name}"
+ f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
  )
  data = get_json_data(path=path)
  columns = data[0].keys()

unstructured_ingest/processes/connectors/delta_table.py
@@ -181,6 +181,7 @@ class DeltaTableUploader(Uploader):
  df = pd.DataFrame(data=data)
  self.upload_dataframe(df=df, file_data=file_data)

+ @requires_dependencies(["pandas"], extras="delta-table")
  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
  df = get_data_df(path)
  self.upload_dataframe(df=df, file_data=file_data)

unstructured_ingest/processes/connectors/duckdb/base.py
@@ -4,7 +4,7 @@ from typing import Any

  from unstructured_ingest.data_types.file_data import FileData
  from unstructured_ingest.interfaces import UploadStager
- from unstructured_ingest.utils.data_prep import get_data, get_enhanced_element_id, write_data
+ from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data
  from unstructured_ingest.utils.dep_check import requires_dependencies

  _COLUMNS = (
@@ -81,7 +81,7 @@ class BaseDuckDBUploadStager(UploadStager):
  ) -> Path:
  import pandas as pd

- elements_contents = get_data(path=elements_filepath)
+ elements_contents = get_json_data(path=elements_filepath)
  output_filename_suffix = Path(elements_filepath).suffix
  output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
  output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

unstructured_ingest/processes/connectors/duckdb/duckdb.py
@@ -67,9 +67,8 @@ class DuckDBConnectionConfig(ConnectionConfig):

  @contextmanager
  def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
- with self.get_client() as client:
- with client.cursor() as cursor:
- yield cursor
+ with self.get_client() as client, client.cursor() as cursor:
+ yield cursor


  class DuckDBUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class DuckDBUploader(Uploader):
  df = pd.DataFrame(data=data)
  self.upload_dataframe(df=df)

+ @requires_dependencies(["pandas"], extras="duckdb")
  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
  df = get_data_df(path)
  self.upload_dataframe(df=df)
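
The `get_cursor` change above, and the matching MotherDuck, S3, SingleStore, and Redis hunks below, all collapse nested context managers into a single `with` statement. A minimal standalone sketch of the two equivalent forms (the parenthesized multi-line variant requires Python 3.10+):

```python
from contextlib import contextmanager


@contextmanager
def resource(name: str):
    print(f"open {name}")
    try:
        yield name
    finally:
        print(f"close {name}")


# Before: nested blocks, one indentation level per context manager.
with resource("client"):
    with resource("cursor"):
        pass

# After: one statement; parentheses allow one manager per line on Python 3.10+.
with (
    resource("client"),
    resource("cursor"),
):
    pass
```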

unstructured_ingest/processes/connectors/duckdb/motherduck.py
@@ -66,9 +66,8 @@ class MotherDuckConnectionConfig(ConnectionConfig):

  @contextmanager
  def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
- with self.get_client() as client:
- with client.cursor() as cursor:
- yield cursor
+ with self.get_client() as client, client.cursor() as cursor:
+ yield cursor


  class MotherDuckUploadStagerConfig(UploadStagerConfig):
@@ -116,6 +115,7 @@ class MotherDuckUploader(Uploader):
  df = pd.DataFrame(data=data)
  self.upload_dataframe(df=df)

+ @requires_dependencies(["pandas"], extras="duckdb")
  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
  df = get_data_df(path)
  self.upload_dataframe(df=df)

unstructured_ingest/processes/connectors/fsspec/s3.py
@@ -134,9 +134,11 @@ class S3Indexer(FsspecIndexer):

  version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
  metadata: dict[str, str] = {}
- with contextlib.suppress(AttributeError):
- with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
- metadata = client.metadata(path=path)
+ with (
+ contextlib.suppress(AttributeError),
+ self.connection_config.get_client(protocol=self.index_config.protocol) as client,
+ ):
+ metadata = client.metadata(path=path)
  record_locator = {
  "protocol": self.index_config.protocol,
  "remote_file_path": self.index_config.remote_url,

unstructured_ingest/processes/connectors/gitlab.py
@@ -230,8 +230,7 @@ class GitLabDownloader(Downloader):
  download_path = self.get_download_path(file_data=file_data)
  if download_path is None:
  logger.error(
- "Generated download path is None, source_identifiers might be missing"
- "from FileData."
+ "Generated download path is None, source_identifiers might be missingfrom FileData."
  )
  raise ValueError("Generated invalid download path.")


unstructured_ingest/processes/connectors/google_drive.py
@@ -334,7 +334,6 @@ class GoogleDriveIndexer(Indexer):
  recursive: bool = False,
  previous_path: Optional[str] = None,
  ) -> list[dict]:
-
  fields_input = "nextPageToken, files({})".format(",".join(self.fields))
  q = f"'{object_id}' in parents"
  # Filter by extension but still include any directories
@@ -394,7 +393,6 @@ class GoogleDriveIndexer(Indexer):
  if not self.is_dir(root_info):
  data = [self.map_file_data(root_info)]
  else:
-
  file_contents = self.get_paginated_results(
  files_client=files_client,
  object_id=object_id,

unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py
@@ -5,7 +5,6 @@ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple

- import pandas as pd
  from pydantic import Field, Secret

  from unstructured_ingest.data_types.file_data import FileData
@@ -29,6 +28,7 @@ from unstructured_ingest.utils.data_prep import get_data_df
  from unstructured_ingest.utils.dep_check import requires_dependencies

  if TYPE_CHECKING:
+ from pandas import DataFrame
  from pyarrow import Table as ArrowTable
  from pyiceberg.catalog.rest import RestCatalog
  from pyiceberg.table import Table, Transaction
@@ -96,14 +96,12 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
  return UserAuthError(e)
  if 400 <= response_code < 500:
  logger.error(
- f"Request to {url} failed"
- f"in IBM watsonx.data connector, status code {response_code}"
+ f"Request to {url} failedin IBM watsonx.data connector, status code {response_code}"
  )
  return UserError(e)
  if response_code > 500:
  logger.error(
- f"Request to {url} failed"
- f"in IBM watsonx.data connector, status code {response_code}"
+ f"Request to {url} failedin IBM watsonx.data connector, status code {response_code}"
  )
  return ProviderError(e)
  logger.error(f"Unhandled exception from IBM watsonx.data connector: {e}", exc_info=True)
@@ -217,7 +215,7 @@ class IbmWatsonxUploader(SQLUploader):
  return self.upload_config.record_id_key in self.get_table_columns()

  @requires_dependencies(["pyarrow"], extras="ibm-watsonx-s3")
- def _df_to_arrow_table(self, df: pd.DataFrame) -> "ArrowTable":
+ def _df_to_arrow_table(self, df: "DataFrame") -> "ArrowTable":
  import pyarrow as pa

  # Iceberg will automatically fill missing columns with nulls
@@ -277,16 +275,20 @@ class IbmWatsonxUploader(SQLUploader):
  except Exception as e:
  raise ProviderError(f"Failed to upload data to table: {e}")

- def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+ def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
  data_table = self._df_to_arrow_table(df)

  with self.get_table() as table:
  self.upload_data_table(table, data_table, file_data)

+ @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
  def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+ import pandas as pd
+
  df = pd.DataFrame(data)
  self.upload_dataframe(df=df, file_data=file_data)

+ @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
  df = get_data_df(path=path)
  self.upload_dataframe(df=df, file_data=file_data)
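
The IBM watsonx.data hunks above also move pandas behind `if TYPE_CHECKING` and quote the `DataFrame` annotations, so the module imports even when pandas is absent. A small self-contained sketch of that pattern (the names here are illustrative, not the connector's real ones):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers; no runtime pandas requirement.
    from pandas import DataFrame


def row_count(df: "DataFrame") -> int:
    # The string annotation keeps this module importable without pandas installed.
    return len(df.index)


if __name__ == "__main__":
    import pandas as pd  # imported lazily, only where actually needed

    print(row_count(pd.DataFrame({"a": [1, 2, 3]})))
```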

unstructured_ingest/processes/connectors/kdbai.py
@@ -141,6 +141,7 @@ class KdbaiUploader(Uploader):
  df = pd.DataFrame(data=data)
  self.process_dataframe(df=df)

+ @requires_dependencies(["pandas"], extras="kdbai")
  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
  data = get_data_df(path=path)
  self.process_dataframe(df=data)

unstructured_ingest/processes/connectors/outlook.py
@@ -199,8 +199,7 @@ class OutlookDownloader(Downloader):
  download_path = self.get_download_path(file_data)
  if download_path is None:
  logger.error(
- "Generated download path is None, source_identifiers might be missing"
- "from FileData."
+ "Generated download path is None, source_identifiers might be missingfrom FileData."
  )
  raise ValueError("Generated invalid download path.")


unstructured_ingest/processes/connectors/pinecone.py
@@ -227,7 +227,6 @@ class PineconeUploader(VectorDBUploader):
  self.connection_config.index_name = index_name

  if not self.index_exists(index_name):
-
  logger.info(f"creating pinecone index {index_name}")

  pc = self.connection_config.get_client()

unstructured_ingest/processes/connectors/redisdb.py
@@ -143,36 +143,40 @@ class RedisUploader(Uploader):
  await asyncio.gather(*[self._write_batch(batch, redis_stack) for batch in batches])

  async def _write_batch(self, batch: list[dict], redis_stack: bool) -> None:
- async with self.connection_config.create_async_client() as async_client:
- async with async_client.pipeline(transaction=True) as pipe:
- for element in batch:
- key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
- if redis_stack:
- pipe.json().set(key_with_prefix, "$", element)
- else:
- pipe.set(key_with_prefix, json.dumps(element))
- await pipe.execute()
+ async with (
+ self.connection_config.create_async_client() as async_client,
+ async_client.pipeline(transaction=True) as pipe,
+ ):
+ for element in batch:
+ key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+ if redis_stack:
+ pipe.json().set(key_with_prefix, "$", element)
+ else:
+ pipe.set(key_with_prefix, json.dumps(element))
+ await pipe.execute()

  @requires_dependencies(["redis"], extras="redis")
  async def _check_redis_stack(self, element: dict) -> bool:
  from redis import exceptions as redis_exceptions

  redis_stack = True
- async with self.connection_config.create_async_client() as async_client:
- async with async_client.pipeline(transaction=True) as pipe:
- key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
- try:
- # Redis with stack extension supports JSON type
- await pipe.json().set(key_with_prefix, "$", element).execute()
- except redis_exceptions.ResponseError as e:
- message = str(e)
- if "unknown command `JSON.SET`" in message:
- # if this error occurs, Redis server doesn't support JSON type,
- # so save as string type instead
- await pipe.set(key_with_prefix, json.dumps(element)).execute()
- redis_stack = False
- else:
- raise e
+ async with (
+ self.connection_config.create_async_client() as async_client,
+ async_client.pipeline(transaction=True) as pipe,
+ ):
+ key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+ try:
+ # Redis with stack extension supports JSON type
+ await pipe.json().set(key_with_prefix, "$", element).execute()
+ except redis_exceptions.ResponseError as e:
+ message = str(e)
+ if "unknown command `JSON.SET`" in message:
+ # if this error occurs, Redis server doesn't support JSON type,
+ # so save as string type instead
+ await pipe.set(key_with_prefix, json.dumps(element)).execute()
+ redis_stack = False
+ else:
+ raise e
  return redis_stack



unstructured_ingest/processes/connectors/salesforce.py
@@ -81,7 +81,7 @@ class SalesforceAccessConfig(AccessConfig):
  consumer_key: str
  private_key_path: Optional[Path] = Field(
  default=None,
- description="Path to the private key file. " "Key file is usually named server.key.",
+ description="Path to the private key file. Key file is usually named server.key.",
  )
  private_key: Optional[str] = Field(default=None, description="Contents of the private key")


unstructured_ingest/processes/connectors/slack.py
@@ -166,8 +166,7 @@ class SlackDownloader(Downloader):
  download_path = self.get_download_path(file_data)
  if download_path is None:
  logger.error(
- "Generated download path is None, source_identifiers might be missing"
- "from FileData."
+ "Generated download path is None, source_identifiers might be missingfrom FileData."
  )
  raise ValueError("Generated invalid download path.")


unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py
@@ -2,6 +2,7 @@ import json
  import os
  from contextlib import contextmanager
  from dataclasses import dataclass
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field, Secret
@@ -128,6 +129,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
  connection_config: DatabricksDeltaTablesConnectionConfig
  connector_type: str = CONNECTOR_TYPE

+ @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+ def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+ super().run(path=path, file_data=file_data, **kwargs)
+
  @contextmanager
  def get_cursor(self) -> Generator[Any, None, None]:
  with self.connection_config.get_cursor() as cursor:

unstructured_ingest/processes/connectors/sql/postgres.py
@@ -1,9 +1,11 @@
  from contextlib import contextmanager
  from dataclasses import dataclass, field
- from typing import TYPE_CHECKING, Generator, Optional
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field, Secret

+ from unstructured_ingest.data_types.file_data import FileData
  from unstructured_ingest.logger import logger
  from unstructured_ingest.processes.connector_registry import (
  DestinationRegistryEntry,
@@ -144,6 +146,10 @@ class PostgresUploader(SQLUploader):
  connector_type: str = CONNECTOR_TYPE
  values_delimiter: str = "%s"

+ @requires_dependencies(["pandas"], extras="postgres")
+ def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+ super().run(path=path, file_data=file_data, **kwargs)
+

  postgres_source_entry = SourceRegistryEntry(
  connection_config=PostgresConnectionConfig,

unstructured_ingest/processes/connectors/sql/singlestore.py
@@ -1,10 +1,12 @@
  import json
  from contextlib import contextmanager
  from dataclasses import dataclass, field
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field, Secret

+ from unstructured_ingest.data_types.file_data import FileData
  from unstructured_ingest.logger import logger
  from unstructured_ingest.processes.connector_registry import (
  DestinationRegistryEntry,
@@ -65,12 +67,11 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):

  @contextmanager
  def get_cursor(self) -> Generator["SingleStoreCursor", None, None]:
- with self.get_connection() as connection:
- with connection.cursor() as cursor:
- try:
- yield cursor
- finally:
- cursor.close()
+ with self.get_connection() as connection, connection.cursor() as cursor:
+ try:
+ yield cursor
+ finally:
+ cursor.close()


  class SingleStoreIndexerConfig(SQLIndexerConfig):
@@ -131,6 +132,10 @@ class SingleStoreUploader(SQLUploader):
  values_delimiter: str = "%s"
  connector_type: str = CONNECTOR_TYPE

+ @requires_dependencies(["pandas"], extras="singlestore")
+ def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+ super().run(path=path, file_data=file_data, **kwargs)
+
  @requires_dependencies(["pandas"], extras="singlestore")
  def prepare_data(
  self, columns: list[str], data: tuple[tuple[Any, ...], ...]

unstructured_ingest/processes/connectors/sql/snowflake.py
@@ -1,6 +1,7 @@
  import json
  from contextlib import contextmanager
  from dataclasses import dataclass, field
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field, Secret
@@ -173,6 +174,10 @@ class SnowflakeUploader(SQLUploader):
  connector_type: str = CONNECTOR_TYPE
  values_delimiter: str = "?"

+ @requires_dependencies(["pandas"], extras="snowflake")
+ def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+ super().run(path=path, file_data=file_data, **kwargs)
+
  @requires_dependencies(["pandas"], extras="snowflake")
  def prepare_data(
  self, columns: list[str], data: tuple[tuple[Any, ...], ...]