unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; see the package's registry page for more details.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +2 -2
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,287 +0,0 @@
1
- import contextlib
2
- import json
3
- import os
4
- from dataclasses import dataclass
5
- from pathlib import Path
6
- from uuid import uuid4
7
-
8
- import pytest
9
- from _pytest.fixtures import TopRequest
10
- from astrapy import Collection
11
- from astrapy import DataAPIClient as AstraDBClient
12
-
13
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, VECTOR_DB_TAG
14
- from test.integration.connectors.utils.validation.destination import (
15
- StagerValidationConfigs,
16
- stager_validation,
17
- )
18
- from test.integration.connectors.utils.validation.source import (
19
- SourceValidationConfigs,
20
- source_connector_validation,
21
- )
22
- from test.integration.utils import requires_env
23
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
24
- from unstructured_ingest.processes.connectors.astradb import (
25
- CONNECTOR_TYPE,
26
- AstraDBAccessConfig,
27
- AstraDBConnectionConfig,
28
- AstraDBDownloader,
29
- AstraDBDownloaderConfig,
30
- AstraDBIndexer,
31
- AstraDBIndexerConfig,
32
- AstraDBUploader,
33
- AstraDBUploaderConfig,
34
- AstraDBUploadStager,
35
- AstraDBUploadStagerConfig,
36
- DestinationConnectionError,
37
- SourceConnectionError,
38
- )
39
-
40
- EXISTENT_COLLECTION_NAME = "ingest_test_src"
41
- NONEXISTENT_COLLECTION_NAME = "nonexistant"
42
-
43
-
44
- @pytest.fixture
45
- def connection_config() -> AstraDBConnectionConfig:
46
- return AstraDBConnectionConfig(
47
- access_config=AstraDBAccessConfig(
48
- token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
49
- api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
50
- )
51
- )
52
-
53
-
54
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
55
- @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
56
- def test_precheck_succeeds_indexer(connection_config: AstraDBConnectionConfig):
57
- indexer = AstraDBIndexer(
58
- connection_config=connection_config,
59
- index_config=AstraDBIndexerConfig(collection_name=EXISTENT_COLLECTION_NAME),
60
- )
61
- indexer.precheck()
62
-
63
-
64
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
65
- @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
66
- def test_precheck_succeeds_uploader(connection_config: AstraDBConnectionConfig):
67
- uploader = AstraDBUploader(
68
- connection_config=connection_config,
69
- upload_config=AstraDBUploaderConfig(collection_name=EXISTENT_COLLECTION_NAME),
70
- )
71
- uploader.precheck()
72
-
73
- uploader2 = AstraDBUploader(
74
- connection_config=connection_config,
75
- upload_config=AstraDBUploaderConfig(),
76
- )
77
- uploader2.precheck()
78
-
79
-
80
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
81
- @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
82
- def test_precheck_fails_indexer(connection_config: AstraDBConnectionConfig):
83
- indexer = AstraDBIndexer(
84
- connection_config=connection_config,
85
- index_config=AstraDBIndexerConfig(collection_name=NONEXISTENT_COLLECTION_NAME),
86
- )
87
- with pytest.raises(expected_exception=SourceConnectionError):
88
- indexer.precheck()
89
-
90
-
91
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
92
- @requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
93
- def test_precheck_fails_uploader(connection_config: AstraDBConnectionConfig):
94
- uploader = AstraDBUploader(
95
- connection_config=connection_config,
96
- upload_config=AstraDBUploaderConfig(collection_name=NONEXISTENT_COLLECTION_NAME),
97
- )
98
- with pytest.raises(expected_exception=DestinationConnectionError):
99
- uploader.precheck()
100
-
101
-
102
- @dataclass(frozen=True)
103
- class EnvData:
104
- api_endpoint: str
105
- token: str
106
-
107
-
108
- def get_env_data() -> EnvData:
109
- return EnvData(
110
- api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
111
- token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
112
- )
113
-
114
-
115
- @pytest.fixture
116
- def collection(upload_file: Path) -> Collection:
117
- random_id = str(uuid4())[:8]
118
- collection_name = f"utic_test_{random_id}"
119
- with upload_file.open("r") as upload_fp:
120
- upload_data = json.load(upload_fp)
121
- first_content = upload_data[0]
122
- embeddings = first_content["embeddings"]
123
- embedding_dimension = len(embeddings)
124
- my_client = AstraDBClient()
125
- env_data = get_env_data()
126
- astra_db = my_client.get_database(
127
- api_endpoint=env_data.api_endpoint,
128
- token=env_data.token,
129
- )
130
- collection = astra_db.create_collection(collection_name, dimension=embedding_dimension)
131
- try:
132
- yield collection
133
- finally:
134
- astra_db.drop_collection(collection)
135
-
136
-
137
- @pytest.mark.asyncio
138
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
139
- @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
140
- async def test_astra_search_source(
141
- tmp_path: Path,
142
- ):
143
- env_data = get_env_data()
144
- collection_name = "ingest_test_src"
145
- connection_config = AstraDBConnectionConfig(
146
- access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
147
- )
148
- indexer = AstraDBIndexer(
149
- index_config=AstraDBIndexerConfig(
150
- collection_name=collection_name,
151
- ),
152
- connection_config=connection_config,
153
- )
154
- downloader = AstraDBDownloader(
155
- connection_config=connection_config,
156
- download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
157
- )
158
-
159
- await source_connector_validation(
160
- indexer=indexer,
161
- downloader=downloader,
162
- configs=SourceValidationConfigs(
163
- test_id=CONNECTOR_TYPE,
164
- expected_num_files=5,
165
- expected_number_indexed_file_data=1,
166
- validate_downloaded_files=True,
167
- ),
168
- )
169
-
170
-
171
- @pytest.mark.asyncio
172
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
173
- @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
174
- async def test_astra_search_destination(
175
- upload_file: Path,
176
- collection: Collection,
177
- tmp_path: Path,
178
- ):
179
- file_data = FileData(
180
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
181
- connector_type=CONNECTOR_TYPE,
182
- identifier="mock file data",
183
- )
184
- stager = AstraDBUploadStager()
185
- env_data = get_env_data()
186
- uploader = AstraDBUploader(
187
- connection_config=AstraDBConnectionConfig(
188
- access_config=AstraDBAccessConfig(
189
- api_endpoint=env_data.api_endpoint, token=env_data.token
190
- ),
191
- ),
192
- upload_config=AstraDBUploaderConfig(collection_name=collection.name),
193
- )
194
- staged_filepath = stager.run(
195
- elements_filepath=upload_file,
196
- file_data=file_data,
197
- output_dir=tmp_path,
198
- output_filename=upload_file.name,
199
- )
200
- uploader.precheck()
201
- uploader.run(path=staged_filepath, file_data=file_data)
202
-
203
- # Run validation
204
- with staged_filepath.open() as f:
205
- staged_elements = json.load(f)
206
- expected_count = len(staged_elements)
207
- current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
208
- assert current_count == expected_count, (
209
- f"Expected count ({expected_count}) doesn't match how "
210
- f"much came back from collection: {current_count}"
211
- )
212
-
213
- # Rerun and make sure the same documents get updated
214
- uploader.run(path=staged_filepath, file_data=file_data)
215
- current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
216
- assert current_count == expected_count, (
217
- f"Expected count ({expected_count}) doesn't match how "
218
- f"much came back from collection: {current_count}"
219
- )
220
-
221
-
222
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
223
- @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
224
- def test_astra_create_destination():
225
- env_data = get_env_data()
226
- connection_config = AstraDBConnectionConfig(
227
- access_config=AstraDBAccessConfig(api_endpoint=env_data.api_endpoint, token=env_data.token),
228
- )
229
- uploader = AstraDBUploader(
230
- connection_config=connection_config,
231
- upload_config=AstraDBUploaderConfig(),
232
- )
233
- collection_name = "system_created-123"
234
- formatted_collection_name = "system_created_123"
235
-
236
- client = AstraDBClient()
237
- db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
238
- with contextlib.suppress(Exception):
239
- # drop collection before trying to create it
240
- db.drop_collection(formatted_collection_name)
241
-
242
- created = uploader.create_destination(destination_name=collection_name, vector_length=3072)
243
- assert created
244
- assert uploader.upload_config.collection_name == formatted_collection_name
245
-
246
- created = uploader.create_destination(destination_name=collection_name, vector_length=3072)
247
- assert not created
248
-
249
- # cleanup
250
- db.drop_collection(formatted_collection_name)
251
-
252
-
253
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
254
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
255
- def test_astra_stager(
256
- request: TopRequest,
257
- upload_file_str: str,
258
- tmp_path: Path,
259
- ):
260
- upload_file: Path = request.getfixturevalue(upload_file_str)
261
- stager = AstraDBUploadStager()
262
- stager_validation(
263
- configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
264
- input_file=upload_file,
265
- stager=stager,
266
- tmp_dir=tmp_path,
267
- )
268
-
269
-
270
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
271
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
272
- def test_astra_stager_flatten_metadata(
273
- request: TopRequest,
274
- upload_file_str: str,
275
- tmp_path: Path,
276
- ):
277
- stager_config = AstraDBUploadStagerConfig(flatten_metadata=True)
278
- upload_file: Path = request.getfixturevalue(upload_file_str)
279
- stager = AstraDBUploadStager(upload_stager_config=stager_config)
280
- stager_validation(
281
- configs=StagerValidationConfigs(
282
- test_id=CONNECTOR_TYPE, expected_count=22, expected_folder="stager_flatten_metadata"
283
- ),
284
- input_file=upload_file,
285
- stager=stager,
286
- tmp_dir=tmp_path,
287
- )
@@ -1,254 +0,0 @@
1
- import json
2
- import os
3
- import time
4
- from pathlib import Path
5
- from uuid import uuid4
6
-
7
- import pytest
8
- from _pytest.fixtures import TopRequest
9
- from azure.core.credentials import AzureKeyCredential
10
- from azure.search.documents import SearchClient
11
- from azure.search.documents.indexes import SearchIndexClient
12
- from azure.search.documents.indexes.models import (
13
- ComplexField,
14
- CorsOptions,
15
- HnswAlgorithmConfiguration,
16
- HnswParameters,
17
- SearchField,
18
- SearchFieldDataType,
19
- SearchIndex,
20
- SimpleField,
21
- VectorSearch,
22
- VectorSearchAlgorithmMetric,
23
- VectorSearchProfile,
24
- )
25
-
26
- from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
27
- from test.integration.connectors.utils.validation.destination import (
28
- StagerValidationConfigs,
29
- stager_validation,
30
- )
31
- from test.integration.utils import requires_env
32
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
33
- from unstructured_ingest.processes.connectors.azure_ai_search import (
34
- CONNECTOR_TYPE,
35
- RECORD_ID_LABEL,
36
- AzureAISearchAccessConfig,
37
- AzureAISearchConnectionConfig,
38
- AzureAISearchUploader,
39
- AzureAISearchUploaderConfig,
40
- AzureAISearchUploadStager,
41
- AzureAISearchUploadStagerConfig,
42
- )
43
-
44
- repo_path = Path(__file__).parent.resolve()
45
-
46
- API_KEY = "AZURE_SEARCH_API_KEY"
47
- ENDPOINT = "https://ingest-test-azure-cognitive-search.search.windows.net"
48
-
49
-
50
- def get_api_key() -> str:
51
- key = os.environ[API_KEY]
52
- return key
53
-
54
-
55
- def get_fields() -> list:
56
- data_source_fields = [
57
- SimpleField(name="url", type=SearchFieldDataType.String),
58
- SimpleField(name="version", type=SearchFieldDataType.String),
59
- SimpleField(name="date_created", type=SearchFieldDataType.DateTimeOffset),
60
- SimpleField(name="date_modified", type=SearchFieldDataType.DateTimeOffset),
61
- SimpleField(name="date_processed", type=SearchFieldDataType.DateTimeOffset),
62
- SimpleField(name="permissions_data", type=SearchFieldDataType.String),
63
- SimpleField(name="record_locator", type=SearchFieldDataType.String),
64
- ]
65
- coordinates_fields = [
66
- SimpleField(name="system", type=SearchFieldDataType.String),
67
- SimpleField(name="layout_width", type=SearchFieldDataType.Double),
68
- SimpleField(name="layout_height", type=SearchFieldDataType.Double),
69
- SimpleField(name="points", type=SearchFieldDataType.String),
70
- ]
71
- metadata_fields = [
72
- SimpleField(name="orig_elements", type=SearchFieldDataType.String),
73
- SimpleField(name="category_depth", type=SearchFieldDataType.Int32),
74
- SimpleField(name="parent_id", type=SearchFieldDataType.String),
75
- SimpleField(name="attached_to_filename", type=SearchFieldDataType.String),
76
- SimpleField(name="filetype", type=SearchFieldDataType.String),
77
- SimpleField(name="last_modified", type=SearchFieldDataType.DateTimeOffset),
78
- SimpleField(name="is_continuation", type=SearchFieldDataType.Boolean),
79
- SimpleField(name="file_directory", type=SearchFieldDataType.String),
80
- SimpleField(name="filename", type=SearchFieldDataType.String),
81
- ComplexField(name="data_source", fields=data_source_fields),
82
- ComplexField(name="coordinates", fields=coordinates_fields),
83
- SimpleField(
84
- name="languages", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
85
- ),
86
- SimpleField(name="page_number", type=SearchFieldDataType.String),
87
- SimpleField(name="links", type=SearchFieldDataType.Collection(SearchFieldDataType.String)),
88
- SimpleField(name="page_name", type=SearchFieldDataType.String),
89
- SimpleField(name="url", type=SearchFieldDataType.String),
90
- SimpleField(
91
- name="link_urls", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
92
- ),
93
- SimpleField(
94
- name="link_texts", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
95
- ),
96
- SimpleField(
97
- name="sent_from", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
98
- ),
99
- SimpleField(
100
- name="sent_to", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
101
- ),
102
- SimpleField(name="subject", type=SearchFieldDataType.String),
103
- SimpleField(name="section", type=SearchFieldDataType.String),
104
- SimpleField(name="header_footer_type", type=SearchFieldDataType.String),
105
- SimpleField(
106
- name="emphasized_text_contents",
107
- type=SearchFieldDataType.Collection(SearchFieldDataType.String),
108
- ),
109
- SimpleField(
110
- name="emphasized_text_tags",
111
- type=SearchFieldDataType.Collection(SearchFieldDataType.String),
112
- ),
113
- SimpleField(name="text_as_html", type=SearchFieldDataType.String),
114
- SimpleField(name="regex_metadata", type=SearchFieldDataType.String),
115
- SimpleField(name="detection_class_prob", type=SearchFieldDataType.Double),
116
- ]
117
- fields = [
118
- SimpleField(name="id", type=SearchFieldDataType.String, key=True),
119
- SimpleField(name=RECORD_ID_LABEL, type=SearchFieldDataType.String, filterable=True),
120
- SimpleField(name="element_id", type=SearchFieldDataType.String),
121
- SimpleField(name="text", type=SearchFieldDataType.String),
122
- SimpleField(name="type", type=SearchFieldDataType.String),
123
- ComplexField(name="metadata", fields=metadata_fields),
124
- SearchField(
125
- name="embeddings",
126
- type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
127
- vector_search_dimensions=384,
128
- vector_search_profile_name="embeddings-config-profile",
129
- ),
130
- ]
131
- return fields
132
-
133
-
134
- def get_vector_search() -> VectorSearch:
135
- return VectorSearch(
136
- algorithms=[
137
- HnswAlgorithmConfiguration(
138
- name="hnsw-config",
139
- parameters=HnswParameters(
140
- metric=VectorSearchAlgorithmMetric.COSINE,
141
- ),
142
- )
143
- ],
144
- profiles=[
145
- VectorSearchProfile(
146
- name="embeddings-config-profile", algorithm_configuration_name="hnsw-config"
147
- )
148
- ],
149
- )
150
-
151
-
152
- def get_search_index_client() -> SearchIndexClient:
153
- api_key = get_api_key()
154
- return SearchIndexClient(ENDPOINT, AzureKeyCredential(api_key))
155
-
156
-
157
- @pytest.fixture
158
- def index() -> str:
159
- random_id = str(uuid4())[:8]
160
- index_name = f"utic-test-{random_id}"
161
- client = get_search_index_client()
162
- index = SearchIndex(
163
- name=index_name,
164
- fields=get_fields(),
165
- vector_search=get_vector_search(),
166
- cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
167
- )
168
- print(f"creating index: {index_name}")
169
- client.create_index(index=index)
170
- try:
171
- yield index_name
172
- finally:
173
- print(f"deleting index: {index_name}")
174
- client.delete_index(index)
175
-
176
-
177
- def validate_count(
178
- search_client: SearchClient, expected_count: int, retries: int = 10, interval: int = 1
179
- ) -> None:
180
- index_count = search_client.get_document_count()
181
- if index_count == expected_count:
182
- return
183
- tries = 0
184
- while tries < retries:
185
- time.sleep(interval)
186
- index_count = search_client.get_document_count()
187
- if index_count == expected_count:
188
- break
189
- assert index_count == expected_count, (
190
- f"Expected count ({expected_count}) doesn't match how "
191
- f"much came back from index: {index_count}"
192
- )
193
-
194
-
195
- @pytest.mark.asyncio
196
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
197
- @requires_env("AZURE_SEARCH_API_KEY")
198
- async def test_azure_ai_search_destination(
199
- upload_file: Path,
200
- index: str,
201
- tmp_path: Path,
202
- ):
203
- file_data = FileData(
204
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
205
- connector_type=CONNECTOR_TYPE,
206
- identifier="mock file data",
207
- )
208
- stager = AzureAISearchUploadStager(upload_stager_config=AzureAISearchUploadStagerConfig())
209
-
210
- uploader = AzureAISearchUploader(
211
- connection_config=AzureAISearchConnectionConfig(
212
- access_config=AzureAISearchAccessConfig(key=get_api_key()),
213
- endpoint=ENDPOINT,
214
- index=index,
215
- ),
216
- upload_config=AzureAISearchUploaderConfig(),
217
- )
218
- staged_filepath = stager.run(
219
- elements_filepath=upload_file,
220
- file_data=file_data,
221
- output_dir=tmp_path,
222
- output_filename=upload_file.name,
223
- )
224
- uploader.precheck()
225
- uploader.run(path=staged_filepath, file_data=file_data)
226
-
227
- # Run validation
228
- with staged_filepath.open() as f:
229
- staged_elements = json.load(f)
230
- expected_count = len(staged_elements)
231
- with uploader.connection_config.get_search_client() as search_client:
232
- validate_count(search_client=search_client, expected_count=expected_count)
233
-
234
- # Rerun and make sure the same documents get updated
235
- uploader.run(path=staged_filepath, file_data=file_data)
236
- with uploader.connection_config.get_search_client() as search_client:
237
- validate_count(search_client=search_client, expected_count=expected_count)
238
-
239
-
240
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
241
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
242
- def test_azure_ai_search_stager(
243
- request: TopRequest,
244
- upload_file_str: str,
245
- tmp_path: Path,
246
- ):
247
- upload_file: Path = request.getfixturevalue(upload_file_str)
248
- stager = AzureAISearchUploadStager()
249
- stager_validation(
250
- configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
251
- input_file=upload_file,
252
- stager=stager,
253
- tmp_dir=tmp_path,
254
- )
@@ -1,136 +0,0 @@
1
- # add this back in when figure out why it's failing since NOTHING changed when it started failing
2
-
3
- # ==================================== ERRORS ====================================
4
- # _________ ERROR collecting test/integration/connectors/test_chroma.py __________
5
- # ImportError while importing test module '/home/runner/work/unstructured-ingest/
6
- # unstructured-ingest/test/integration/connectors/test_chroma.py'.
7
- # Hint: make sure your test modules/packages have valid Python names.
8
- # Traceback:
9
- # /opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/importlib/__init__.py:126: in import_module
10
- # return _bootstrap._gcd_import(name[level:], package, level)
11
- # test/integration/connectors/test_chroma.py:4: in <module>
12
- # import chromadb
13
- # E ModuleNotFoundError: No module named 'chromadb'
14
-
15
-
16
- """
17
- import json
18
- from pathlib import Path
19
-
20
- import chromadb
21
- import pytest
22
- from _pytest.fixtures import TopRequest
23
-
24
- from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
25
- from test.integration.connectors.utils.docker import HealthCheck, container_context
26
- from test.integration.connectors.utils.validation.destination import (
27
- StagerValidationConfigs,
28
- stager_validation,
29
- )
30
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
31
- from unstructured_ingest.processes.connectors.chroma import (
32
- CONNECTOR_TYPE,
33
- ChromaConnectionConfig,
34
- ChromaUploader,
35
- ChromaUploaderConfig,
36
- ChromaUploadStager,
37
- ChromaUploadStagerConfig,
38
- )
39
-
40
-
41
- @pytest.fixture
42
- def chroma_instance():
43
- with container_context(
44
- image="chromadb/chroma:0.6.2",
45
- ports={8000: 8000},
46
- name="chroma_int_test",
47
- healthcheck=HealthCheck(
48
- interval=5,
49
- timeout=10,
50
- retries=3,
51
- test="timeout 10s bash -c ':> /dev/tcp/127.0.0.1/8000' || exit 1",
52
- ),
53
- ) as ctx:
54
- yield ctx
55
-
56
-
57
- def validate_collection(collection_name: str, num_embeddings: int):
58
- print(f"Checking contents of Chroma collection: {collection_name}")
59
-
60
- chroma_client = chromadb.HttpClient(
61
- host="localhost",
62
- port="8000",
63
- tenant="default_tenant",
64
- database="default_database",
65
- )
66
-
67
- collection = chroma_client.get_or_create_collection(name=collection_name)
68
-
69
- number_of_embeddings = collection.count()
70
- expected_embeddings = num_embeddings
71
- print(
72
- f"# of embeddings in collection vs expected: {number_of_embeddings}/{expected_embeddings}"
73
- )
74
-
75
- assert number_of_embeddings == expected_embeddings, (
76
- f"Number of rows in generated table ({number_of_embeddings}) "
77
- f"doesn't match expected value: {expected_embeddings}"
78
- )
79
-
80
-
81
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
82
- def test_chroma_destination(
83
- upload_file: Path,
84
- chroma_instance,
85
- tmp_path: Path,
86
- ):
87
- collection_name = "test_collection"
88
- file_data = FileData(
89
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
90
- connector_type=CONNECTOR_TYPE,
91
- identifier="mock file data",
92
- )
93
- stager = ChromaUploadStager(upload_stager_config=ChromaUploadStagerConfig())
94
-
95
- uploader = ChromaUploader(
96
- connection_config=ChromaConnectionConfig(
97
- host="localhost",
98
- port=8000,
99
- tenant="default_tenant",
100
- database="default_database",
101
- ),
102
- upload_config=ChromaUploaderConfig(collection_name=collection_name),
103
- )
104
- staged_filepath = stager.run(
105
- elements_filepath=upload_file,
106
- file_data=file_data,
107
- output_dir=tmp_path,
108
- output_filename=upload_file.name,
109
- )
110
- uploader.precheck()
111
- uploader.run(path=staged_filepath, file_data=file_data)
112
-
113
- # Run validation
114
- with staged_filepath.open() as f:
115
- staged_elements = json.load(f)
116
- expected_count = len(staged_elements)
117
- validate_collection(collection_name=collection_name, num_embeddings=expected_count)
118
-
119
-
120
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
121
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "stager", VECTOR_DB_TAG)
122
- def test_chroma_stager(
123
- request: TopRequest,
124
- upload_file_str: str,
125
- tmp_path: Path,
126
- ):
127
- upload_file: Path = request.getfixturevalue(upload_file_str)
128
- stager = ChromaUploadStager()
129
- stager_validation(
130
- configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
131
- input_file=upload_file,
132
- stager=stager,
133
- tmp_dir=tmp_path,
134
- )
135
-
136
- """