unstructured-ingest 0.7.2 (py3-none-any.whl) → 1.0.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.
Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +48 -34
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
@@ -143,36 +143,40 @@ class RedisUploader(Uploader):
         await asyncio.gather(*[self._write_batch(batch, redis_stack) for batch in batches])

     async def _write_batch(self, batch: list[dict], redis_stack: bool) -> None:
-        async with self.connection_config.create_async_client() as async_client:
-            async with async_client.pipeline(transaction=True) as pipe:
-                for element in batch:
-                    key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
-                    if redis_stack:
-                        pipe.json().set(key_with_prefix, "$", element)
-                    else:
-                        pipe.set(key_with_prefix, json.dumps(element))
-                await pipe.execute()
+        async with (
+            self.connection_config.create_async_client() as async_client,
+            async_client.pipeline(transaction=True) as pipe,
+        ):
+            for element in batch:
+                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+                if redis_stack:
+                    pipe.json().set(key_with_prefix, "$", element)
+                else:
+                    pipe.set(key_with_prefix, json.dumps(element))
+            await pipe.execute()

     @requires_dependencies(["redis"], extras="redis")
     async def _check_redis_stack(self, element: dict) -> bool:
         from redis import exceptions as redis_exceptions

         redis_stack = True
-        async with self.connection_config.create_async_client() as async_client:
-            async with async_client.pipeline(transaction=True) as pipe:
-                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
-                try:
-                    # Redis with stack extension supports JSON type
-                    await pipe.json().set(key_with_prefix, "$", element).execute()
-                except redis_exceptions.ResponseError as e:
-                    message = str(e)
-                    if "unknown command `JSON.SET`" in message:
-                        # if this error occurs, Redis server doesn't support JSON type,
-                        # so save as string type instead
-                        await pipe.set(key_with_prefix, json.dumps(element)).execute()
-                        redis_stack = False
-                    else:
-                        raise e
+        async with (
+            self.connection_config.create_async_client() as async_client,
+            async_client.pipeline(transaction=True) as pipe,
+        ):
+            key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
+            try:
+                # Redis with stack extension supports JSON type
+                await pipe.json().set(key_with_prefix, "$", element).execute()
+            except redis_exceptions.ResponseError as e:
+                message = str(e)
+                if "unknown command `JSON.SET`" in message:
+                    # if this error occurs, Redis server doesn't support JSON type,
+                    # so save as string type instead
+                    await pipe.set(key_with_prefix, json.dumps(element)).execute()
+                    redis_stack = False
+                else:
+                    raise e
         return redis_stack

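The refactor above folds the nested async with blocks into one parenthesized group, a form that requires Python 3.10 or later. A minimal, self-contained sketch of the same pattern, using hypothetical context managers in place of the Redis client and pipeline:

import asyncio
from contextlib import asynccontextmanager


@asynccontextmanager
async def open_client():
    # hypothetical stand-in for connection_config.create_async_client()
    yield "client"


@asynccontextmanager
async def open_pipeline(client):
    # hypothetical stand-in for async_client.pipeline(transaction=True)
    yield f"{client}:pipeline"


async def main() -> None:
    # Python 3.10+ allows grouping several context managers in one parenthesized `async with`
    async with (
        open_client() as client,
        open_pipeline(client) as pipe,
    ):
        print(client, pipe)


asyncio.run(main())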
@@ -81,7 +81,7 @@ class SalesforceAccessConfig(AccessConfig):
     consumer_key: str
     private_key_path: Optional[Path] = Field(
         default=None,
-        description="Path to the private key file. " "Key file is usually named server.key.",
+        description="Path to the private key file. Key file is usually named server.key.",
     )
     private_key: Optional[str] = Field(default=None, description="Contents of the private key")

@@ -166,8 +166,7 @@ class SlackDownloader(Downloader):
         download_path = self.get_download_path(file_data)
         if download_path is None:
             logger.error(
-                "Generated download path is None, source_identifiers might be missing"
-                "from FileData."
+                "Generated download path is None, source_identifiers might be missingfrom FileData."
             )
             raise ValueError("Generated invalid download path.")

@@ -2,6 +2,7 @@ import json
 import os
 from contextlib import contextmanager
 from dataclasses import dataclass
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret
@@ -128,6 +129,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @contextmanager
     def get_cursor(self) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor() as cursor:
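This run override (repeated below for the postgres, singlestore, snowflake, sqlite, and vastdb uploaders) changes no behavior; it only attaches the pandas dependency check to the entry point so a missing extra fails fast. A rough sketch of the pattern, with a hypothetical decorator and base class standing in for requires_dependencies and SQLUploader:

import importlib.util
from functools import wraps
from pathlib import Path
from typing import Any, Callable, Optional


def requires_dependencies(deps: list[str], extras: Optional[str] = None) -> Callable:
    # hypothetical stand-in for the library's decorator: fail fast with an
    # install hint if a required import is missing, before any work is done
    def decorator(fn: Callable) -> Callable:
        @wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            for dep in deps:
                if importlib.util.find_spec(dep) is None:
                    hint = f"unstructured-ingest[{extras}]" if extras else dep
                    raise ImportError(f"missing dependency {dep!r}; install {hint!r}")
            return fn(*args, **kwargs)
        return wrapper
    return decorator


class BaseSQLUploader:
    def run(self, path: Path, file_data: dict, **kwargs: Any) -> None:
        print(f"uploading {path}")


class PostgresLikeUploader(BaseSQLUploader):
    # the override adds no logic of its own; it only wraps run() with the dependency check
    @requires_dependencies(["pandas"], extras="postgres")
    def run(self, path: Path, file_data: dict, **kwargs: Any) -> None:
        super().run(path=path, file_data=file_data, **kwargs)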
@@ -1,9 +1,11 @@
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Generator, Optional
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret

+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -144,6 +146,10 @@ class PostgresUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "%s"

+    @requires_dependencies(["pandas"], extras="postgres")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+

 postgres_source_entry = SourceRegistryEntry(
     connection_config=PostgresConnectionConfig,
@@ -1,10 +1,12 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret

+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -65,12 +67,11 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):

     @contextmanager
     def get_cursor(self) -> Generator["SingleStoreCursor", None, None]:
-        with self.get_connection() as connection:
-            with connection.cursor() as cursor:
-                try:
-                    yield cursor
-                finally:
-                    cursor.close()
+        with self.get_connection() as connection, connection.cursor() as cursor:
+            try:
+                yield cursor
+            finally:
+                cursor.close()


 class SingleStoreIndexerConfig(SQLIndexerConfig):
@@ -131,6 +132,10 @@ class SingleStoreUploader(SQLUploader):
     values_delimiter: str = "%s"
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"], extras="singlestore")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"], extras="singlestore")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
@@ -1,6 +1,7 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret
@@ -173,6 +174,10 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"

+    @requires_dependencies(["pandas"], extras="snowflake")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"], extras="snowflake")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
@@ -36,9 +36,9 @@ from unstructured_ingest.interfaces import (
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.constants import RECORD_ID_LABEL
 from unstructured_ingest.utils.data_prep import (
-    get_data,
     get_data_df,
     get_enhanced_element_id,
+    get_json_data,
     split_dataframe,
     write_data,
 )
@@ -122,8 +122,7 @@ class SQLIndexer(Indexer, ABC):
         id_batches: list[frozenset[str]] = [
             frozenset(
                 ids[
-                    i
-                    * self.index_config.batch_size : (i + 1)  # noqa
+                    i * self.index_config.batch_size : (i + 1)  # noqa
                     * self.index_config.batch_size
                 ]
             )
@@ -272,7 +271,7 @@ class SQLUploadStager(UploadStager):
     ) -> Path:
         import pandas as pd

-        elements_contents = get_data(path=elements_filepath)
+        elements_contents = get_json_data(path=elements_filepath)

         df = pd.DataFrame(
             data=[
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Generator

 from pydantic import Field, Secret, model_validator

+from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -133,6 +134,10 @@ class SQLiteUploader(SQLUploader):
     connection_config: SQLiteConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"])
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pandas"])
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
@@ -1,5 +1,6 @@
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional

 from pydantic import Field, Secret
@@ -68,9 +69,8 @@ class VastdbConnectionConfig(SQLConnectionConfig):

     @contextmanager
     def get_cursor(self) -> "VastdbTransaction":
-        with self.get_connection() as connection:
-            with connection.transaction() as transaction:
-                yield transaction
+        with self.get_connection() as connection, connection.transaction() as transaction:
+            yield transaction

     @contextmanager
     def get_table(self, table_name: str) -> "VastdbTable":
@@ -190,6 +190,10 @@ class VastdbUploader(SQLUploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

+    @requires_dependencies(["pandas"], extras="vastdb")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
     @requires_dependencies(["pyarrow", "pandas"], extras="vastdb")
     def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         import numpy as np
@@ -108,7 +108,6 @@ class VectaraUploaderConfig(UploaderConfig):

 @dataclass
 class VectaraUploader(Uploader):
-
     connector_type: str = CONNECTOR_TYPE
     upload_config: VectaraUploaderConfig
     connection_config: VectaraConnectionConfig
@@ -336,7 +335,6 @@ class VectaraUploader(Uploader):
         file_data: FileData,
         **kwargs: Any,
     ) -> None:
-
         logger.info(f"inserting / updating {len(data)} documents to Vectara ")
         await asyncio.gather(*(self._index_document(vdoc) for vdoc in data))

@@ -53,7 +53,6 @@ class ZendeskConnectionConfig(ConnectionConfig):
     access_config: Secret[ZendeskAccessConfig]

     def get_client(self) -> ZendeskClient:
-
         access_config = self.access_config.get_secret_value()

         return ZendeskClient(
@@ -206,7 +205,6 @@ class ZendeskDownloader(Downloader):
             await f.write(comment.as_text())

     async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-
         zendesk_filedata = ZendeskFileData.cast(file_data=file_data)

         item_type = zendesk_filedata.additional_metadata.item_type
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional
 from pydantic import BaseModel, Field, SecretStr

 from unstructured_ingest.interfaces.process import BaseProcess
-from unstructured_ingest.utils.data_prep import get_data
+from unstructured_ingest.utils.data_prep import get_json_data

 if TYPE_CHECKING:
     from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
@@ -192,7 +192,7 @@
     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         # TODO update base embedder classes to support async
         embedder = self.config.get_embedder()
-        elements = get_data(path=elements_filepath)
+        elements = get_json_data(path=elements_filepath)
         if not elements:
             return []
         embedded_elements = embedder.embed_documents(elements=elements)
@@ -13,7 +13,7 @@ from unstructured_ingest.logger import logger
 class FiltererConfig(BaseModel):
     file_glob: Optional[list[str]] = Field(
         default=None,
-        description="file globs to limit which data_types of " "files are accepted",
+        description="file globs to limit which data_types of files are accepted",
         examples=["*.pdf", "*.html"],
     )
     max_file_size: Optional[int] = Field(
@@ -68,6 +68,9 @@ class PartitionerConfig(BaseModel):
         description="Use a remote API to partition the files."
         " Otherwise, use the function from partition.auto",
     )
+    api_timeout_ms: Optional[int] = Field(
+        default=None, description="Timeout in milliseconds for all api call during partitioning."
+    )
     api_key: Optional[SecretStr] = Field(
         default=None, description="API Key for partition endpoint."
     )
@@ -188,6 +191,7 @@
            api_key=self.config.api_key.get_secret_value(),
            filename=filename,
            api_parameters=self.config.to_partition_kwargs(),
+           timeout_ms=self.config.api_timeout_ms,
        )

        # Append the data source metadata the auto partition does for you
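Only api_key and api_timeout_ms appear in the hunk above, so this configuration sketch omits whatever other fields are needed to enable remote partitioning; the values are illustrative placeholders:

from unstructured_ingest.processes.partitioner import PartitionerConfig

config = PartitionerConfig(
    api_key="YOUR-API-KEY",   # placeholder credential
    api_timeout_ms=120_000,   # give up on any partition API call after two minutes
)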
@@ -4,7 +4,7 @@ from typing import Any

 from unstructured_ingest.data_types.file_data import FileData
 from unstructured_ingest.interfaces import UploadStager, UploadStagerConfig
-from unstructured_ingest.utils.data_prep import get_data, write_data
+from unstructured_ingest.utils.data_prep import get_json_data, write_data


 class BlobStoreUploadStagerConfig(UploadStagerConfig):
@@ -27,6 +27,6 @@ class BlobStoreUploadStager(UploadStager):
     ) -> Path:
         output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
         # Always save as json
-        data = get_data(elements_filepath)
+        data = get_json_data(elements_filepath)
         write_data(path=output_file.with_suffix(".json"), data=data)
         return output_file.with_suffix(".json")
@@ -80,7 +80,11 @@ def wrap_error(e: Exception) -> Exception:


 async def call_api_async(
-    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
+    server_url: Optional[str],
+    api_key: Optional[str],
+    filename: Path,
+    api_parameters: dict,
+    timeout_ms: Optional[int] = None,
 ) -> list[dict]:
     """Call the Unstructured API using unstructured-client.

@@ -94,13 +98,10 @@ async def call_api_async(
     """
     from unstructured_client import UnstructuredClient

-    client = UnstructuredClient(
-        server_url=server_url,
-        api_key_auth=api_key,
-    )
+    client = UnstructuredClient(server_url=server_url, api_key_auth=api_key)
     partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
     try:
-        res = await client.general.partition_async(request=partition_request)
+        res = await client.general.partition_async(request=partition_request, timeout_ms=timeout_ms)
     except Exception as e:
         raise wrap_error(e)

@@ -108,7 +109,11 @@


 def call_api(
-    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
+    server_url: Optional[str],
+    api_key: Optional[str],
+    filename: Path,
+    api_parameters: dict,
+    timeout_ms: Optional[int] = None,
 ) -> list[dict]:
     """Call the Unstructured API using unstructured-client.

@@ -128,7 +133,7 @@
     )
     partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
     try:
-        res = client.general.partition(request=partition_request)
+        res = client.general.partition(request=partition_request, timeout_ms=timeout_ms)
     except Exception as e:
         raise wrap_error(e)

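A usage sketch of the updated helper, assuming only the signature shown above; the endpoint, key, file, and partition parameters are placeholders:

from pathlib import Path

from unstructured_ingest.unstructured_api import call_api

elements = call_api(
    server_url="https://api.example.com",   # placeholder endpoint
    api_key="YOUR-API-KEY",                  # placeholder credential
    filename=Path("document.pdf"),           # placeholder input file
    api_parameters={"strategy": "hi_res"},   # placeholder partition parameters
    timeout_ms=60_000,                       # abort the request after one minute
)
print(f"partitioned into {len(elements)} elements")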
@@ -2,7 +2,7 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
+from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
 from uuid import NAMESPACE_DNS, uuid5

 from unstructured_ingest.data_types.file_data import FileData
@@ -171,15 +171,13 @@ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
         raise IOError("Unsupported file type: {path}")


-def get_data(path: Union[Path, str]) -> list[dict]:
-    if isinstance(path, str):
-        path = Path(path)
-    try:
-        return get_data_by_suffix(path=path)
-    except Exception as e:
-        logger.warning(f"failed to read {path} by extension: {e}")
-        # Fall back
+def get_json_data(path: Path) -> list[dict]:
     with path.open() as f:
+        # Attempt by prefix
+        if path.suffix == ".json":
+            return json.load(f)
+        elif path.suffix == ".ndjson":
+            return ndjson.load(f)
         try:
             return json.load(f)
         except Exception as e:
@@ -188,29 +186,7 @@ def get_data(path: Union[Path, str]) -> list[dict]:
             return ndjson.load(f)
         except Exception as e:
             logger.warning(f"failed to read {path} as ndjson: {e}")
-
-    import pandas as pd
-
-    try:
-        df = pd.read_csv(path)
-        return df.to_dict(orient="records")
-    except Exception as e:
-        logger.warning(f"failed to read {path} as csv: {e}")
-    try:
-        df = pd.read_parquet(path)
-        return df.to_dict(orient="records")
-    except Exception as e:
-        logger.warning(f"failed to read {path} as parquet: {e}")
-
-
-def get_json_data(path: Path) -> list[dict]:
-    with path.open() as f:
-        if path.suffix == ".json":
-            return json.load(f)
-        elif path.suffix == ".ndjson":
-            return ndjson.load(f)
-        else:
-            raise ValueError(f"Unsupported file type: {path}")
+    raise ValueError(f"Unsupported json file: {path}")


 @requires_dependencies(["pandas"])
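To illustrate the stricter behavior of the consolidated get_json_data, here is a sketch based only on the diff above; the temporary file and element payload are hypothetical:

import json
import tempfile
from pathlib import Path

from unstructured_ingest.utils.data_prep import get_json_data

with tempfile.TemporaryDirectory() as tmp:
    elements_path = Path(tmp) / "elements.json"
    # hypothetical element payload written as a .json file
    elements_path.write_text(json.dumps([{"element_id": "abc123", "text": "hello"}]))

    elements = get_json_data(path=elements_path)  # .json and .ndjson are read by suffix
    print(elements[0]["element_id"])

# Unlike the old get_data, there is no pandas-based CSV/parquet fallback: a file that is
# neither valid json nor ndjson now raises ValueError("Unsupported json file: ...").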