unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (93) hide show
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +156 -0
  10. test/integration/connectors/test_azure_cog_search.py +233 -0
  11. test/integration/connectors/test_delta_table.py +46 -0
  12. test/integration/connectors/test_kafka.py +150 -16
  13. test/integration/connectors/test_lancedb.py +209 -0
  14. test/integration/connectors/test_milvus.py +141 -0
  15. test/integration/connectors/test_pinecone.py +213 -0
  16. test/integration/connectors/test_s3.py +23 -0
  17. test/integration/connectors/utils/docker.py +81 -15
  18. test/integration/connectors/utils/validation.py +10 -0
  19. test/integration/connectors/weaviate/__init__.py +0 -0
  20. test/integration/connectors/weaviate/conftest.py +15 -0
  21. test/integration/connectors/weaviate/test_local.py +131 -0
  22. test/unit/v2/__init__.py +0 -0
  23. test/unit/v2/chunkers/__init__.py +0 -0
  24. test/unit/v2/chunkers/test_chunkers.py +49 -0
  25. test/unit/v2/connectors/__init__.py +0 -0
  26. test/unit/v2/embedders/__init__.py +0 -0
  27. test/unit/v2/embedders/test_bedrock.py +36 -0
  28. test/unit/v2/embedders/test_huggingface.py +48 -0
  29. test/unit/v2/embedders/test_mixedbread.py +37 -0
  30. test/unit/v2/embedders/test_octoai.py +35 -0
  31. test/unit/v2/embedders/test_openai.py +35 -0
  32. test/unit/v2/embedders/test_togetherai.py +37 -0
  33. test/unit/v2/embedders/test_vertexai.py +37 -0
  34. test/unit/v2/embedders/test_voyageai.py +38 -0
  35. test/unit/v2/partitioners/__init__.py +0 -0
  36. test/unit/v2/partitioners/test_partitioner.py +63 -0
  37. test/unit/v2/utils/__init__.py +0 -0
  38. test/unit/v2/utils/data_generator.py +32 -0
  39. unstructured_ingest/__version__.py +1 -1
  40. unstructured_ingest/cli/cmds/__init__.py +2 -2
  41. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  42. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  43. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  44. unstructured_ingest/runner/writers/__init__.py +2 -2
  45. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  46. unstructured_ingest/utils/data_prep.py +9 -1
  47. unstructured_ingest/v2/constants.py +2 -0
  48. unstructured_ingest/v2/processes/connectors/__init__.py +7 -20
  49. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  50. unstructured_ingest/v2/processes/connectors/astradb.py +35 -23
  51. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +116 -35
  52. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  53. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  54. unstructured_ingest/v2/processes/connectors/delta_table.py +37 -9
  55. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  56. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +93 -46
  57. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  58. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
  59. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  60. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
  61. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
  62. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +84 -23
  63. unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
  64. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  65. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  66. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  67. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  69. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  70. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
  72. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +101 -13
  74. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  75. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  77. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  78. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  79. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  80. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  81. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  82. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
  83. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +20 -19
  84. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +91 -50
  85. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  86. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  87. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  88. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  89. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  90. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
  91. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
  92. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
  93. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,90 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Generator, Optional
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
9
+ from unstructured_ingest.v2.processes.connectors.weaviate.weaviate import (
10
+ WeaviateAccessConfig,
11
+ WeaviateConnectionConfig,
12
+ WeaviateUploader,
13
+ WeaviateUploaderConfig,
14
+ WeaviateUploadStager,
15
+ WeaviateUploadStagerConfig,
16
+ )
17
+
18
+ if TYPE_CHECKING:
19
+ from weaviate.client import WeaviateClient
20
+
21
+ CONNECTOR_TYPE = "weaviate-embedded"
22
+
23
+
24
class EmbeddedWeaviateAccessConfig(WeaviateAccessConfig):
    """Access config for the embedded connector; adds no fields to WeaviateAccessConfig."""
26
+
27
+
28
class EmbeddedWeaviateConnectionConfig(WeaviateConnectionConfig):
    """Connection settings used to open a client through ``weaviate.connect_to_embedded``."""

    hostname: str = Field(default="127.0.0.1", description="hostname")
    port: int = Field(default=8079, description="http port")
    grpc_port: int = Field(default=50050, description="grpc port")
    data_path: Optional[str] = Field(
        default=None,
        description=(
            "directory where the files making up the database are stored. "
            "If not provided, will default to underlying SDK implementation"
        ),
    )
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a client from ``connect_to_embedded``; it is closed when the context exits."""
        # Imported lazily so the module can be loaded without the optional weaviate extra.
        from weaviate import connect_to_embedded
        from weaviate.classes.init import AdditionalConfig

        connect_kwargs = {
            "hostname": self.hostname,
            "port": self.port,
            "grpc_port": self.grpc_port,
            "persistence_data_path": self.data_path,
            "additional_config": AdditionalConfig(timeout=self.get_timeout()),
        }
        with connect_to_embedded(**connect_kwargs) as client:
            yield client
56
+
57
+
58
class EmbeddedWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    """Stager config for the embedded connector; adds no fields to the base stager config."""
60
+
61
+
62
@dataclass
class EmbeddedWeaviateUploadStager(WeaviateUploadStager):
    """Upload stager for the embedded connector; staging logic is inherited unchanged."""

    # Build the default from the embedded config class so the default value matches the
    # field's declared type (previously the base WeaviateUploadStagerConfig was created).
    upload_stager_config: EmbeddedWeaviateUploadStagerConfig = field(
        default_factory=EmbeddedWeaviateUploadStagerConfig
    )
67
+
68
+
69
class EmbeddedWeaviateUploaderConfig(WeaviateUploaderConfig):
    """Uploader config for the embedded connector; adds no fields to WeaviateUploaderConfig."""
71
+
72
+
73
@dataclass
class EmbeddedWeaviateUploader(WeaviateUploader):
    """Uploader bound to the embedded connection config; upload logic is inherited."""

    connection_config: EmbeddedWeaviateConnectionConfig = field(
        default_factory=EmbeddedWeaviateConnectionConfig
    )
    # NOTE(review): the base uploader config declares `collection` without a default, so
    # this no-argument factory presumably fails validation when actually used - confirm
    # whether callers are always expected to pass upload_config explicitly.
    upload_config: EmbeddedWeaviateUploaderConfig = field(
        default_factory=EmbeddedWeaviateUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE
82
+
83
+
84
# Registry entry bundling the embedded-Weaviate destination classes defined in this module.
weaviate_embedded_destination_entry = DestinationRegistryEntry(
    connection_config=EmbeddedWeaviateConnectionConfig,
    upload_stager_config=EmbeddedWeaviateUploadStagerConfig,
    upload_stager=EmbeddedWeaviateUploadStager,
    uploader_config=EmbeddedWeaviateUploaderConfig,
    uploader=EmbeddedWeaviateUploader,
)
@@ -0,0 +1,73 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Generator
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
9
+ from unstructured_ingest.v2.processes.connectors.weaviate.weaviate import (
10
+ WeaviateAccessConfig,
11
+ WeaviateConnectionConfig,
12
+ WeaviateUploader,
13
+ WeaviateUploaderConfig,
14
+ WeaviateUploadStager,
15
+ WeaviateUploadStagerConfig,
16
+ )
17
+
18
+ if TYPE_CHECKING:
19
+ from weaviate.client import WeaviateClient
20
+
21
+ CONNECTOR_TYPE = "weaviate-local"
22
+
23
+
24
class LocalWeaviateAccessConfig(WeaviateAccessConfig):
    """Access config for the local connector; adds no fields to WeaviateAccessConfig."""
26
+
27
+
28
class LocalWeaviateConnectionConfig(WeaviateConnectionConfig):
    """Connection settings used to open a client through ``weaviate.connect_to_local``."""

    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a client from ``connect_to_local``; it is closed when the context exits."""
        # Imported lazily so the module can be loaded without the optional weaviate extra.
        from weaviate import connect_to_local
        from weaviate.classes.init import AdditionalConfig

        # Only the timeouts are forwarded; no host or port is passed to connect_to_local.
        client_options = AdditionalConfig(timeout=self.get_timeout())
        with connect_to_local(additional_config=client_options) as client:
            yield client
43
+
44
+
45
class LocalWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    """Stager config for the local connector; adds no fields to the base stager config."""
47
+
48
+
49
@dataclass
class LocalWeaviateUploadStager(WeaviateUploadStager):
    """Upload stager for the local connector; staging logic is inherited unchanged."""

    # Build the default from the local config class so the default value matches the
    # field's declared type (previously the base WeaviateUploadStagerConfig was created).
    upload_stager_config: LocalWeaviateUploadStagerConfig = field(
        default_factory=LocalWeaviateUploadStagerConfig
    )
54
+
55
+
56
class LocalWeaviateUploaderConfig(WeaviateUploaderConfig):
    """Uploader config for the local connector; adds no fields to WeaviateUploaderConfig."""
58
+
59
+
60
@dataclass
class LocalWeaviateUploader(WeaviateUploader):
    """Uploader bound to the local connection config; upload logic is inherited."""

    # Both configs are re-declared only to narrow their types to the local variants.
    # Field order is kept as-is because dataclass field ordering depends on it.
    upload_config: LocalWeaviateUploaderConfig
    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalWeaviateConnectionConfig
65
+
66
+
67
# Registry entry bundling the local-Weaviate destination classes defined in this module.
weaviate_local_destination_entry = DestinationRegistryEntry(
    connection_config=LocalWeaviateConnectionConfig,
    upload_stager_config=LocalWeaviateUploadStagerConfig,
    upload_stager=LocalWeaviateUploadStager,
    uploader_config=LocalWeaviateUploaderConfig,
    uploader=LocalWeaviateUploader,
)
@@ -0,0 +1,289 @@
1
+ import json
2
+ from abc import ABC, abstractmethod
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from datetime import date, datetime
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Generator, Optional
8
+
9
+ from dateutil import parser
10
+ from pydantic import Field, Secret
11
+
12
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
15
+ from unstructured_ingest.v2.interfaces import (
16
+ AccessConfig,
17
+ ConnectionConfig,
18
+ FileData,
19
+ Uploader,
20
+ UploaderConfig,
21
+ UploadStager,
22
+ UploadStagerConfig,
23
+ )
24
+ from unstructured_ingest.v2.logger import logger
25
+
26
+ if TYPE_CHECKING:
27
+ from weaviate.classes.init import Timeout
28
+ from weaviate.client import WeaviateClient
29
+ from weaviate.collections.batch.client import BatchClient
30
+
31
+ CONNECTOR_TYPE = "weaviate"
32
+
33
+
34
class WeaviateAccessConfig(AccessConfig, ABC):
    """Base access config shared by the Weaviate connectors; declares no fields itself."""
36
+
37
+
38
class WeaviateConnectionConfig(ConnectionConfig, ABC):
    """Base connection config: holds the client timeouts and declares ``get_client``."""

    init_timeout: int = Field(default=2, ge=0, description="Timeout for initialization checks")
    insert_timeout: int = Field(default=90, ge=0, description="Timeout for insert operations")
    query_timeout: int = Field(default=30, ge=0, description="Timeout for query operations")
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_timeout(self) -> "Timeout":
        """Build the weaviate ``Timeout`` object from the three configured timeout fields."""
        # Imported lazily so the module can be loaded without the optional weaviate extra.
        from weaviate.classes.init import Timeout

        timeout_settings = {
            "init": self.init_timeout,
            "query": self.query_timeout,
            "insert": self.insert_timeout,
        }
        return Timeout(**timeout_settings)

    @abstractmethod
    @contextmanager
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Context manager yielding a Weaviate client; implemented by each subclass."""
56
+
57
+
58
class WeaviateUploadStagerConfig(UploadStagerConfig):
    """Stager config for Weaviate; adds no fields to UploadStagerConfig."""
60
+
61
+
62
@dataclass
class WeaviateUploadStager(UploadStager):
    """Rewrite element dictionaries into the shape written to Weaviate and stage them on disk."""

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=WeaviateUploadStagerConfig
    )

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given either as a numeric timestamp or as a date string.

        The value is first interpreted as a float timestamp; if that fails for any
        reason it is handed to ``dateutil.parser.parse``.

        Returns:
            The parsed ``datetime`` (a ``date`` subclass).
        """
        try:
            timestamp = float(date_string)
            return datetime.fromtimestamp(timestamp)
        except Exception as e:
            # Deliberate best-effort: anything that is not a usable timestamp falls
            # through to the generic date-string parser below.
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    @classmethod
    def _format_date(cls, value: str) -> str:
        """Parse ``value`` and render it in the timestamp format used for date fields."""
        # NOTE(review): the trailing "Z" is a literal; no timezone conversion happens
        # here, so naive/local datetimes are rendered as-is - confirm this is intended.
        return cls.parse_date_string(value).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    @staticmethod
    def _detach(parent: dict, key: str) -> dict:
        """Swap ``parent[key]`` for a shallow copy of itself and return that copy.

        Returns an empty, unattached dict when the key is missing or its value is not
        a dict (for example ``None``), so callers can use ``.get`` unconditionally.
        """
        child = parent.get(key)
        if not isinstance(child, dict):
            return {}
        detached = dict(child)
        parent[key] = detached
        return detached

    @classmethod
    def conform_dict(cls, data: dict, file_data: FileData) -> dict:
        """
        Updates the element dictionary to conform to the Weaviate schema

        The input ``data`` is left untouched: every nested dictionary that gets
        rewritten (``metadata``, ``data_source``, ``coordinates``) is copied first.

        Args:
            data: a single element dictionary.
            file_data: source file information; its ``identifier`` is stored on the
                result under ``RECORD_ID_LABEL``.

        Returns:
            A new dictionary with structured values JSON-encoded, dates reformatted and
            selected fields cast to strings.
        """
        working_data = data.copy()
        # Copy each dict on the path we write to, otherwise the shallow copy above would
        # still share (and therefore mutate) the caller's nested dictionaries.
        metadata = cls._detach(working_data, "metadata")
        data_source = cls._detach(metadata, "data_source")
        coordinates = cls._detach(metadata, "coordinates")

        # Structured values are stored as JSON strings
        for container, key in (
            (data_source, "record_locator"),
            (coordinates, "points"),
            (metadata, "links"),
            (data_source, "permissions_data"),
            (metadata, "regex_metadata"),
        ):
            if value := container.get(key):
                container[key] = json.dumps(value)

        # Datetime formatting
        for container, key in (
            (data_source, "date_created"),
            (data_source, "date_modified"),
            (data_source, "date_processed"),
            (metadata, "last_modified"),
        ):
            if value := container.get(key):
                container[key] = cls._format_date(value)

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)
        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        working_data[RECORD_ID_LABEL] = file_data.identifier
        return working_data

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Conform every element in ``elements_filepath`` and write the result as JSON.

        Args:
            elements_filepath: JSON file holding a list of element dictionaries.
            file_data: passed through to ``conform_dict`` for each element.
            output_dir: directory the staged file is written to (created if missing).
            output_filename: file name without extension; ``.json`` is appended.

        Returns:
            Path of the written JSON file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        updated_elements = [
            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
        ]
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as output_file:
            json.dump(updated_elements, output_file, indent=2)
        return output_path
180
+
181
+
182
class WeaviateUploaderConfig(UploaderConfig):
    """Upload settings: the target collection, the batch mode and the record-id property."""

    collection: str = Field(description="The name of the collection this object belongs to")
    batch_size: Optional[int] = Field(default=None, description="Number of records per batch")
    requests_per_minute: Optional[int] = Field(default=None, description="Rate limit for upload")
    dynamic_batch: bool = Field(default=True, description="Whether to use dynamic batch")
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )

    def model_post_init(self, __context: Any) -> None:
        """Check that exactly one batch mode is enabled.

        Because ``dynamic_batch`` defaults to True, setting ``batch_size`` or
        ``requests_per_minute`` without also disabling it counts as two modes.

        Raises:
            ValueError: if no batch mode or more than one batch mode is enabled.
        """
        candidate_modes = (
            ("fixed_size", self.batch_size is not None),
            ("rate_limited", self.requests_per_minute is not None),
            ("dynamic", self.dynamic_batch),
        )
        enabled_batch_modes = [mode for mode, active in candidate_modes if active]

        if len(enabled_batch_modes) == 0:
            raise ValueError("No batch mode enabled")
        if len(enabled_batch_modes) > 1:
            joined_modes = ", ".join(enabled_batch_modes)
            raise ValueError(
                "Multiple batch modes enabled, only one mode can be used: {}".format(joined_modes)
            )
        logger.info(f"Uploader config instantiated with {enabled_batch_modes[0]} batch mode")

    @contextmanager
    def get_batch_client(self, client: "WeaviateClient") -> Generator["BatchClient", None, None]:
        """Yield the batch client matching the configured mode.

        The modes are checked in the order dynamic, fixed size, rate limited.

        Raises:
            ValueError: if none of the three mode settings is truthy.
        """
        if self.dynamic_batch:
            batch_context = client.batch.dynamic()
        elif self.batch_size:
            batch_context = client.batch.fixed_size(batch_size=self.batch_size)
        elif self.requests_per_minute:
            batch_context = client.batch.rate_limit(requests_per_minute=self.requests_per_minute)
        else:
            raise ValueError("No batch mode enabled")

        with batch_context as batch_client:
            yield batch_client
225
+
226
+
227
@dataclass
class WeaviateUploader(Uploader, ABC):
    """Write staged element files to a Weaviate collection through the batch client."""

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig

    def precheck(self) -> None:
        """Validate the connection by actually opening (and closing) a client.

        Raises:
            DestinationConnectionError: if the client cannot be opened.
        """
        try:
            # get_client() is a @contextmanager: merely calling it builds the manager
            # without connecting, so it has to be entered for the check to do anything.
            with self.connection_config.get_client():
                pass
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def check_for_errors(self, client: "WeaviateClient") -> None:
        """Log every object the batch failed to upload.

        Raises:
            WriteError: if ``client.batch.failed_objects`` is non-empty.
        """
        failed_uploads = client.batch.failed_objects
        if not failed_uploads:
            return
        for failure in failed_uploads:
            logger.error(
                f"Failed to upload object with id {failure.original_uuid}: {failure.message}"
            )
        raise WriteError("Failed to upload to weaviate")

    @requires_dependencies(["weaviate"], extras="weaviate")
    def delete_by_record_id(self, client: "WeaviateClient", file_data: FileData) -> None:
        """Delete all objects whose record-id property equals ``file_data.identifier``.

        Raises:
            WriteError: if any delete call reports failed objects.
        """
        from weaviate.classes.query import Filter

        record_id = file_data.identifier
        collection = client.collections.get(self.upload_config.collection)
        delete_filter = Filter.by_property(name=self.upload_config.record_id_key).equal(
            val=record_id
        )
        # There is a configurable maximum limit (QUERY_MAXIMUM_RESULTS) on the number of
        # objects that can be deleted in a single query (default 10,000). To delete
        # more objects than the limit, re-run the query until nothing is deleted.
        while True:
            resp = collection.data.delete_many(where=delete_filter)
            if resp.failed:
                raise WriteError(
                    f"failed to delete records in collection "
                    f"{self.upload_config.collection} with record "
                    f"id property {record_id}"
                )
            if not resp.successful:
                break

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Replace the record's previous objects with the elements stored in ``path``.

        Args:
            path: staged JSON file holding a list of element dictionaries.
            file_data: identifies the record whose earlier objects are deleted first.

        Raises:
            WriteError: if deleting old objects or uploading new ones fails.
        """
        with path.open("r") as file:
            elements_dict = json.load(file)
        # Log the target collection; the access config is a masked Secret and carried
        # no useful information in this message.
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"collection {self.upload_config.collection}"
        )

        with self.connection_config.get_client() as weaviate_client:
            self.delete_by_record_id(client=weaviate_client, file_data=file_data)
            with self.upload_config.get_batch_client(client=weaviate_client) as batch_client:
                for element in elements_dict:
                    # The embedding is sent as the object's vector, not as a property.
                    vector = element.pop("embeddings", None)
                    batch_client.add_object(
                        collection=self.upload_config.collection,
                        properties=element,
                        vector=vector,
                    )
            # Checked once the batch context has exited, still inside the client context.
            self.check_for_errors(client=weaviate_client)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.2.2
3
+ Version: 0.3.1
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,22 +22,22 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.13
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: opentelemetry-sdk
26
- Requires-Dist: dataclasses-json
27
- Requires-Dist: click
28
- Requires-Dist: tqdm
29
25
  Requires-Dist: python-dateutil
30
- Requires-Dist: pandas
26
+ Requires-Dist: tqdm
27
+ Requires-Dist: click
28
+ Requires-Dist: dataclasses-json
31
29
  Requires-Dist: pydantic>=2.7
30
+ Requires-Dist: pandas
31
+ Requires-Dist: opentelemetry-sdk
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
34
34
  Provides-Extra: astradb
35
35
  Requires-Dist: astrapy; extra == "astradb"
36
36
  Provides-Extra: azure
37
- Requires-Dist: adlfs; extra == "azure"
38
37
  Requires-Dist: fsspec; extra == "azure"
39
- Provides-Extra: azure-cognitive-search
40
- Requires-Dist: azure-search-documents; extra == "azure-cognitive-search"
38
+ Requires-Dist: adlfs; extra == "azure"
39
+ Provides-Extra: azure-ai-search
40
+ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
41
41
  Provides-Extra: bedrock
42
42
  Requires-Dist: boto3; extra == "bedrock"
43
43
  Provides-Extra: biomed
@@ -51,8 +51,8 @@ Requires-Dist: chromadb; extra == "chroma"
51
51
  Provides-Extra: clarifai
52
52
  Requires-Dist: clarifai; extra == "clarifai"
53
53
  Provides-Extra: confluence
54
- Requires-Dist: requests; extra == "confluence"
55
54
  Requires-Dist: atlassian-python-api; extra == "confluence"
55
+ Requires-Dist: requests; extra == "confluence"
56
56
  Provides-Extra: couchbase
57
57
  Requires-Dist: couchbase; extra == "couchbase"
58
58
  Provides-Extra: csv
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
60
60
  Provides-Extra: databricks-volumes
61
61
  Requires-Dist: databricks-sdk; extra == "databricks-volumes"
62
62
  Provides-Extra: delta-table
63
+ Requires-Dist: boto3; extra == "delta-table"
63
64
  Requires-Dist: deltalake; extra == "delta-table"
64
- Requires-Dist: fsspec; extra == "delta-table"
65
65
  Provides-Extra: discord
66
66
  Requires-Dist: discord-py; extra == "discord"
67
67
  Provides-Extra: doc
@@ -78,8 +78,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
78
78
  Provides-Extra: embed-mixedbreadai
79
79
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
80
80
  Provides-Extra: embed-octoai
81
- Requires-Dist: tiktoken; extra == "embed-octoai"
82
81
  Requires-Dist: openai; extra == "embed-octoai"
82
+ Requires-Dist: tiktoken; extra == "embed-octoai"
83
83
  Provides-Extra: embed-vertexai
84
84
  Requires-Dist: vertexai; extra == "embed-vertexai"
85
85
  Provides-Extra: embed-voyageai
@@ -87,8 +87,8 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
87
87
  Provides-Extra: epub
88
88
  Requires-Dist: unstructured[epub]; extra == "epub"
89
89
  Provides-Extra: gcs
90
- Requires-Dist: gcsfs; extra == "gcs"
91
90
  Requires-Dist: bs4; extra == "gcs"
91
+ Requires-Dist: gcsfs; extra == "gcs"
92
92
  Requires-Dist: fsspec; extra == "gcs"
93
93
  Provides-Extra: github
94
94
  Requires-Dist: requests; extra == "github"
@@ -115,26 +115,26 @@ Requires-Dist: pymongo; extra == "mongodb"
115
115
  Provides-Extra: msg
116
116
  Requires-Dist: unstructured[msg]; extra == "msg"
117
117
  Provides-Extra: notion
118
+ Requires-Dist: htmlBuilder; extra == "notion"
118
119
  Requires-Dist: notion-client; extra == "notion"
119
- Requires-Dist: httpx; extra == "notion"
120
120
  Requires-Dist: backoff; extra == "notion"
121
- Requires-Dist: htmlBuilder; extra == "notion"
121
+ Requires-Dist: httpx; extra == "notion"
122
122
  Provides-Extra: odt
123
123
  Requires-Dist: unstructured[odt]; extra == "odt"
124
124
  Provides-Extra: onedrive
125
- Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
126
125
  Requires-Dist: msal; extra == "onedrive"
127
126
  Requires-Dist: bs4; extra == "onedrive"
127
+ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
128
128
  Provides-Extra: openai
129
- Requires-Dist: tiktoken; extra == "openai"
130
129
  Requires-Dist: openai; extra == "openai"
130
+ Requires-Dist: tiktoken; extra == "openai"
131
131
  Provides-Extra: opensearch
132
132
  Requires-Dist: opensearch-py; extra == "opensearch"
133
133
  Provides-Extra: org
134
134
  Requires-Dist: unstructured[org]; extra == "org"
135
135
  Provides-Extra: outlook
136
- Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
137
136
  Requires-Dist: msal; extra == "outlook"
137
+ Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
138
138
  Provides-Extra: pdf
139
139
  Requires-Dist: unstructured[pdf]; extra == "pdf"
140
140
  Provides-Extra: pinecone
@@ -164,13 +164,14 @@ Provides-Extra: sftp
164
164
  Requires-Dist: paramiko; extra == "sftp"
165
165
  Requires-Dist: fsspec; extra == "sftp"
166
166
  Provides-Extra: sharepoint
167
- Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
168
167
  Requires-Dist: msal; extra == "sharepoint"
168
+ Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
169
169
  Provides-Extra: singlestore
170
170
  Requires-Dist: singlestoredb; extra == "singlestore"
171
171
  Provides-Extra: slack
172
172
  Requires-Dist: slack-sdk[optional]; extra == "slack"
173
173
  Provides-Extra: snowflake
174
+ Requires-Dist: psycopg2-binary; extra == "snowflake"
174
175
  Requires-Dist: snowflake-connector-python; extra == "snowflake"
175
176
  Provides-Extra: togetherai
176
177
  Requires-Dist: together; extra == "togetherai"