unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (192)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
test/integration/connectors/test_pinecone.py
@@ -1,387 +0,0 @@
- import json
- import math
- import os
- import re
- import time
- from pathlib import Path
- from typing import Generator
- from uuid import uuid4
-
- import pytest
- from _pytest.fixtures import TopRequest
- from pinecone import Pinecone, ServerlessSpec
- from pinecone.core.openapi.shared.exceptions import NotFoundException
-
- from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
- from test.integration.connectors.utils.validation.destination import (
-     StagerValidationConfigs,
-     stager_validation,
- )
- from test.integration.utils import requires_env
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.error import DestinationConnectionError
- from unstructured_ingest.logger import logger
- from unstructured_ingest.processes.connectors.pinecone import (
-     CONNECTOR_TYPE,
-     MAX_QUERY_RESULTS,
-     PineconeAccessConfig,
-     PineconeConnectionConfig,
-     PineconeUploader,
-     PineconeUploaderConfig,
-     PineconeUploadStager,
-     PineconeUploadStagerConfig,
- )
-
- METADATA_BYTES_LIMIT = (
-     40960  # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
- )
- VECTOR_DIMENSION = 384
- SPEC = {"serverless": {"cloud": "aws", "region": "us-east-1"}}
- ALLOWED_METADATA_FIELD = "text"
- API_KEY = "PINECONE_API_KEY"
-
-
- def get_api_key() -> str:
-     api_key = os.getenv(API_KEY, None)
-     assert api_key
-     return api_key
-
-
- def wait_for_delete(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
-     start = time.time()
-     while True and time.time() - start < timeout:
-         try:
-             description = client.describe_index(name=index_name)
-             logger.info(f"current index status: {description}")
-         except NotFoundException:
-             return
-         time.sleep(interval)
-
-     raise TimeoutError("time out waiting for index to delete")
-
-
- def wait_for_ready(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
-     def is_ready_status():
-         description = client.describe_index(name=index_name)
-         status = description["status"]
-         return status["ready"]
-
-     start = time.time()
-     is_ready = is_ready_status()
-     while not is_ready and time.time() - start < timeout:
-         time.sleep(interval)
-         is_ready = is_ready_status()
-     if not is_ready:
-         raise TimeoutError("time out waiting for index to be ready")
-
-
- @pytest.fixture
- def pinecone_index() -> Generator[str, None, None]:
-     pinecone = Pinecone(api_key=get_api_key())
-     random_id = str(uuid4()).split("-")[0]
-     index_name = f"ingest-test-{random_id}"
-     assert len(index_name) < 45
-     logger.info(f"Creating index: {index_name}")
-     try:
-         pinecone.create_index(
-             name=index_name,
-             dimension=384,
-             metric="cosine",
-             spec=ServerlessSpec(
-                 cloud="aws",
-                 region="us-east-1",
-             ),
-             deletion_protection="disabled",
-         )
-         wait_for_ready(client=pinecone, index_name=index_name)
-         yield index_name
-     except Exception as e:
-         logger.error(f"failed to create index {index_name}: {e}")
-     finally:
-         try:
-             logger.info(f"deleting index: {index_name}")
-             pinecone.delete_index(name=index_name)
-             wait_for_delete(client=pinecone, index_name=index_name)
-         except NotFoundException:
-             return
-
-
- def validate_pinecone_index(
-     index_name: str,
-     expected_num_of_vectors: int,
-     retries=30,
-     interval=1,
-     namespace: str = "default",
- ) -> None:
-     # Because there's a delay for the index to catch up to the recent writes, add in a retry
-     pinecone = Pinecone(api_key=get_api_key())
-     index = pinecone.Index(name=index_name, namespace=namespace)
-     vector_count = -1
-     for i in range(retries):
-         index_stats = index.describe_index_stats()
-         vector_count = index_stats["total_vector_count"]
-         if vector_count == expected_num_of_vectors:
-             logger.info(f"expected {expected_num_of_vectors} == vector count {vector_count}")
-             break
-         logger.info(
-             f"retry attempt {i}: expected {expected_num_of_vectors} != vector count {vector_count}"
-         )
-         time.sleep(interval)
-     assert vector_count == expected_num_of_vectors, (
-         f"vector count from index ({vector_count}) doesn't "
-         f"match expected number: {expected_num_of_vectors}"
-     )
-
-
- @requires_env(API_KEY)
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
- async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
-
-     file_data = FileData(
-         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-         connector_type=CONNECTOR_TYPE,
-         identifier="pinecone_mock_id",
-     )
-
-     connection_config = PineconeConnectionConfig(
-         index_name=pinecone_index,
-         access_config=PineconeAccessConfig(api_key=get_api_key()),
-     )
-     stager_config = PineconeUploadStagerConfig()
-     stager = PineconeUploadStager(upload_stager_config=stager_config)
-     new_upload_file = stager.run(
-         elements_filepath=upload_file,
-         output_dir=temp_dir,
-         output_filename=upload_file.name,
-         file_data=file_data,
-     )
-
-     upload_config = PineconeUploaderConfig()
-     uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
-     uploader.precheck()
-
-     uploader.run(path=new_upload_file, file_data=file_data)
-     with new_upload_file.open() as f:
-         staged_content = json.load(f)
-     expected_num_of_vectors = len(staged_content)
-     logger.info("validating first upload")
-     validate_pinecone_index(
-         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
-     )
-
-     # Rerun uploader and make sure no duplicates exist
-     uploader.run(path=new_upload_file, file_data=file_data)
-     logger.info("validating second upload")
-     validate_pinecone_index(
-         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
-     )
-
-
- @requires_env(API_KEY)
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
- @pytest.mark.skip(reason="TODO: get this to work")
- async def test_pinecone_destination_large_index(
-     pinecone_index: str, upload_file: Path, temp_dir: Path
- ):
-     new_file = temp_dir / "large_file.json"
-     with upload_file.open() as f:
-         upload_content = json.load(f)
-
-     min_entries = math.ceil((MAX_QUERY_RESULTS * 2) / len(upload_content))
-     new_content = (upload_content * min_entries)[: (2 * MAX_QUERY_RESULTS)]
-     print(f"Creating large index content with {len(new_content)} records")
-     with new_file.open("w") as f:
-         json.dump(new_content, f)
-
-     expected_num_of_vectors = len(new_content)
-     file_data = FileData(
-         source_identifiers=SourceIdentifiers(fullpath=new_file.name, filename=new_file.name),
-         connector_type=CONNECTOR_TYPE,
-         identifier="pinecone_mock_id",
-     )
-     connection_config = PineconeConnectionConfig(
-         index_name=pinecone_index,
-         access_config=PineconeAccessConfig(api_key=get_api_key()),
-     )
-     stager_config = PineconeUploadStagerConfig()
-     stager = PineconeUploadStager(upload_stager_config=stager_config)
-     new_upload_file = stager.run(
-         elements_filepath=new_file,
-         output_dir=temp_dir,
-         output_filename=new_file.name,
-         file_data=file_data,
-     )
-
-     upload_config = PineconeUploaderConfig()
-     uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
-     uploader.precheck()
-
-     uploader.run(path=new_upload_file, file_data=file_data)
-     validate_pinecone_index(
-         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
-     )
-     # Rerun uploader and make sure no duplicates exist
-     uploader.run(path=new_upload_file, file_data=file_data)
-     logger.info("validating second upload")
-     validate_pinecone_index(
-         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
-     )
-
-
- @requires_env(API_KEY)
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
- async def test_pinecone_destination_namespace(
-     pinecone_index: str, upload_file: Path, temp_dir: Path
- ):
-     """
-     tests namespace functionality of destination connector.
-     """
-
-     # creates a file data structure.
-     file_data = FileData(
-         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-         connector_type=CONNECTOR_TYPE,
-         identifier="pinecone_mock_id",
-     )
-
-     connection_config = PineconeConnectionConfig(
-         index_name=pinecone_index,
-         access_config=PineconeAccessConfig(api_key=get_api_key()),
-     )
-
-     stager_config = PineconeUploadStagerConfig()
-
-     stager = PineconeUploadStager(upload_stager_config=stager_config)
-     new_upload_file = stager.run(
-         elements_filepath=upload_file,
-         output_dir=temp_dir,
-         output_filename=upload_file.name,
-         file_data=file_data,
-     )
-
-     # here add namespace definition
-     upload_config = PineconeUploaderConfig()
-     namespace_test_name = "user-1"
-     upload_config.namespace = namespace_test_name
-     uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
-     uploader.precheck()
-
-     uploader.run(path=new_upload_file, file_data=file_data)
-     with new_upload_file.open() as f:
-         staged_content = json.load(f)
-     expected_num_of_vectors = len(staged_content)
-     logger.info("validating first upload")
-     validate_pinecone_index(
-         index_name=pinecone_index,
-         expected_num_of_vectors=expected_num_of_vectors,
-         namespace=namespace_test_name,
-     )
-
-     # Rerun uploader and make sure no duplicates exist
-     uploader.run(path=new_upload_file, file_data=file_data)
-     logger.info("validating second upload")
-     validate_pinecone_index(
-         index_name=pinecone_index,
-         expected_num_of_vectors=expected_num_of_vectors,
-         namespace=namespace_test_name,
-     )
-
-
- @requires_env(API_KEY)
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
- def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
-     stager = PineconeUploadStager()
-     uploader = PineconeUploader(
-         connection_config=PineconeConnectionConfig(
-             access_config=PineconeAccessConfig(api_key=get_api_key()),
-             index_name=pinecone_index,
-         ),
-         upload_config=PineconeUploaderConfig(),
-     )
-     large_metadata_upload_file = tmp_path / "mock-upload-file.pdf.json"
-     large_metadata = {ALLOWED_METADATA_FIELD: "0" * 2 * METADATA_BYTES_LIMIT}
-
-     with open(upload_file) as file:
-         elements = json.load(file)
-
-     with open(large_metadata_upload_file, "w") as file:
-         mock_element = elements[0]
-         mock_element["metadata"] = large_metadata
-         json.dump([mock_element], file)
-
-     file_data = FileData(
-         source_identifiers=SourceIdentifiers(
-             fullpath=large_metadata_upload_file.name, filename=large_metadata_upload_file.name
-         ),
-         connector_type=CONNECTOR_TYPE,
-         identifier="mock-file-data",
-     )
-     staged_file = stager.run(
-         elements_filepath=large_metadata_upload_file,
-         file_data=file_data,
-         output_dir=tmp_path,
-         output_filename=large_metadata_upload_file.name,
-     )
-     try:
-         uploader.run(staged_file, file_data)
-     except DestinationConnectionError as e:
-         error_line = r"Metadata size is \d+ bytes, which exceeds the limit of \d+ bytes per vector"
-         if re.search(re.compile(error_line), str(e)) is None:
-             raise e
-         raise pytest.fail("Upload request failed due to metadata exceeding limits.")
-
-     validate_pinecone_index(pinecone_index, 1, interval=5)
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
- def test_pinecone_stager(
-     request: TopRequest,
-     upload_file_str: str,
-     tmp_path: Path,
- ):
-     upload_file: Path = request.getfixturevalue(upload_file_str)
-     stager = PineconeUploadStager()
-     stager_validation(
-         configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-         input_file=upload_file,
-         stager=stager,
-         tmp_dir=tmp_path,
-     )
-
-
- @requires_env(API_KEY)
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
- def test_pinecone_create_destination(pinecone_index):
-     uploader = PineconeUploader(
-         connection_config=PineconeConnectionConfig(
-             access_config=PineconeAccessConfig(api_key=get_api_key())
-         ),
-         upload_config=PineconeUploaderConfig(),
-     )
-
-     random_id = str(uuid4()).split("-")[0]
-
-     index_name = f"test-create-destination-{random_id}"
-
-     assert not uploader.index_exists(index_name=index_name)
-
-     try:
-         uploader.create_destination(destination_name=index_name, vector_length=1536)
-     except Exception as e:
-         error_body = getattr(e, "body", None)
-         raise pytest.fail(f"failed to create destination: {e} {error_body}")
-
-     assert uploader.index_exists(index_name=index_name), "destination was not created successfully"
-
-     try:
-         pc = uploader.connection_config.get_client()
-         logger.info(f"deleting index for test create destination: {index_name}")
-         pc.delete_index(name=index_name)
-     except Exception as e:
-         raise pytest.fail(f"failed to cleanup / delete the destination: {e}")
-
-     assert not uploader.index_exists(index_name=index_name), "cleanup failed"
test/integration/connectors/test_qdrant.py
@@ -1,216 +0,0 @@
- import json
- import os
- import uuid
- from contextlib import asynccontextmanager
- from pathlib import Path
- from typing import AsyncGenerator
-
- import pytest
- from _pytest.fixtures import TopRequest
- from qdrant_client import AsyncQdrantClient
-
- from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
- from test.integration.connectors.utils.docker import container_context
- from test.integration.connectors.utils.validation.destination import (
-     StagerValidationConfigs,
-     stager_validation,
- )
- from test.integration.utils import requires_env
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.processes.connectors.qdrant.cloud import (
-     CloudQdrantAccessConfig,
-     CloudQdrantConnectionConfig,
-     CloudQdrantUploader,
-     CloudQdrantUploaderConfig,
-     CloudQdrantUploadStager,
-     CloudQdrantUploadStagerConfig,
- )
- from unstructured_ingest.processes.connectors.qdrant.local import (
-     CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE,
- )
- from unstructured_ingest.processes.connectors.qdrant.local import (
-     LocalQdrantConnectionConfig,
-     LocalQdrantUploader,
-     LocalQdrantUploaderConfig,
-     LocalQdrantUploadStager,
-     LocalQdrantUploadStagerConfig,
- )
- from unstructured_ingest.processes.connectors.qdrant.server import (
-     CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE,
- )
- from unstructured_ingest.processes.connectors.qdrant.server import (
-     ServerQdrantConnectionConfig,
-     ServerQdrantUploader,
-     ServerQdrantUploaderConfig,
-     ServerQdrantUploadStager,
-     ServerQdrantUploadStagerConfig,
- )
-
- COLLECTION_NAME = f"test-coll-{uuid.uuid4().hex[:12]}"
- VECTORS_CONFIG = {"size": 384, "distance": "Cosine"}
-
-
- @asynccontextmanager
- async def qdrant_client(client_params: dict) -> AsyncGenerator[AsyncQdrantClient, None]:
-     client = AsyncQdrantClient(**client_params)
-     try:
-         yield client
-     finally:
-         await client.close()
-
-
- async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
-     with upload_file.open() as upload_fp:
-         elements = json.load(upload_fp)
-     expected_point_count = len(elements)
-     first_element = elements[0]
-     expected_text = first_element["text"]
-     embeddings = first_element["embeddings"]
-     collection = await client.get_collection(COLLECTION_NAME)
-     assert collection.points_count == expected_point_count
-
-     response = await client.query_points(COLLECTION_NAME, query=embeddings, limit=1)
-     assert response.points[0].payload is not None
-     assert response.points[0].payload["text"] == expected_text
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
- async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
-     connection_kwargs = {"path": str(tmp_path / "qdrant")}
-     async with qdrant_client(connection_kwargs) as client:
-         await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
-     AsyncQdrantClient(**connection_kwargs)
-     stager = LocalQdrantUploadStager(
-         upload_stager_config=LocalQdrantUploadStagerConfig(),
-     )
-     uploader = LocalQdrantUploader(
-         connection_config=LocalQdrantConnectionConfig(**connection_kwargs),
-         upload_config=LocalQdrantUploaderConfig(collection_name=COLLECTION_NAME),
-     )
-
-     file_data = FileData(
-         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-         connector_type=LOCAL_CONNECTOR_TYPE,
-         identifier="mock-file-data",
-     )
-
-     staged_upload_file = stager.run(
-         elements_filepath=upload_file,
-         file_data=file_data,
-         output_dir=tmp_path,
-         output_filename=upload_file.name,
-     )
-
-     if uploader.is_async():
-         await uploader.run_async(path=staged_upload_file, file_data=file_data)
-     else:
-         uploader.run(path=upload_file, file_data=file_data)
-     async with qdrant_client(connection_kwargs) as client:
-         await validate_upload(client=client, upload_file=upload_file)
-
-
- @pytest.fixture
- def docker_context():
-     with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
-         yield container
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
- async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
-     connection_kwargs = {"location": "http://localhost:6333"}
-     async with qdrant_client(connection_kwargs) as client:
-         await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
-     AsyncQdrantClient(**connection_kwargs)
-     stager = ServerQdrantUploadStager(
-         upload_stager_config=ServerQdrantUploadStagerConfig(),
-     )
-     uploader = ServerQdrantUploader(
-         connection_config=ServerQdrantConnectionConfig(**connection_kwargs),
-         upload_config=ServerQdrantUploaderConfig(collection_name=COLLECTION_NAME),
-     )
-
-     file_data = FileData(
-         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-         connector_type=SERVER_CONNECTOR_TYPE,
-         identifier="mock-file-data",
-     )
-
-     staged_upload_file = stager.run(
-         elements_filepath=upload_file,
-         file_data=file_data,
-         output_dir=tmp_path,
-         output_filename=upload_file.name,
-     )
-     uploader.precheck()
-     if uploader.is_async():
-         await uploader.run_async(path=staged_upload_file, file_data=file_data)
-     else:
-         uploader.run(path=upload_file, file_data=file_data)
-     async with qdrant_client(connection_kwargs) as client:
-         await validate_upload(client=client, upload_file=upload_file)
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
- @requires_env("QDRANT_API_KEY", "QDRANT_SERVER_URL")
- async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
-     server_url = os.environ["QDRANT_SERVER_URL"]
-     api_key = os.environ["QDRANT_API_KEY"]
-     connection_kwargs = {"location": server_url, "api_key": api_key}
-     async with qdrant_client(connection_kwargs) as client:
-         await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
-     AsyncQdrantClient(**connection_kwargs)
-
-     stager = CloudQdrantUploadStager(
-         upload_stager_config=CloudQdrantUploadStagerConfig(),
-     )
-     uploader = CloudQdrantUploader(
-         connection_config=CloudQdrantConnectionConfig(
-             url=server_url,
-             access_config=CloudQdrantAccessConfig(
-                 api_key=api_key,
-             ),
-         ),
-         upload_config=CloudQdrantUploaderConfig(collection_name=COLLECTION_NAME),
-     )
-
-     file_data = FileData(
-         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-         connector_type=SERVER_CONNECTOR_TYPE,
-         identifier="mock-file-data",
-     )
-
-     staged_upload_file = stager.run(
-         elements_filepath=upload_file,
-         file_data=file_data,
-         output_dir=tmp_path,
-         output_filename=upload_file.name,
-     )
-     uploader.precheck()
-     if uploader.is_async():
-         await uploader.run_async(path=staged_upload_file, file_data=file_data)
-     else:
-         uploader.run(path=staged_upload_file, file_data=file_data)
-     async with qdrant_client(connection_kwargs) as client:
-         await validate_upload(client=client, upload_file=upload_file)
-
-
- @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant", VECTOR_DB_TAG)
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
- def test_qdrant_stager(
-     request: TopRequest,
-     upload_file_str: str,
-     tmp_path: Path,
- ):
-     upload_file: Path = request.getfixturevalue(upload_file_str)
-     stager = LocalQdrantUploadStager(
-         upload_stager_config=LocalQdrantUploadStagerConfig(),
-     )
-     stager_validation(
-         configs=StagerValidationConfigs(test_id=LOCAL_CONNECTOR_TYPE, expected_count=22),
-         input_file=upload_file,
-         stager=stager,
-         tmp_dir=tmp_path,
-     )