unstructured-ingest: 0.7.1 → 1.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
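The comparison can be reproduced locally from the published artifacts. Below is a minimal sketch, assuming both wheels have already been fetched into the working directory (for example with pip download unstructured-ingest==0.7.1 --no-deps, and again for 1.0.1) and that their filenames follow the standard wheel naming shown in the dist-info entries of this diff; it uses only the Python standard library.

import zipfile

# Archive listings of the two published wheels (filenames assumed to follow
# the standard wheel naming convention).
old = set(zipfile.ZipFile("unstructured_ingest-0.7.1-py3-none-any.whl").namelist())
new = set(zipfile.ZipFile("unstructured_ingest-1.0.1-py3-none-any.whl").namelist())

# Paths only in 0.7.1 were removed; paths only in 1.0.1 were added.
for path in sorted(old - new):
    print("-", path)
for path in sorted(new - old):
    print("+", path)
print(f"{len(old - new)} removed, {len(new - old)} added")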

Potentially problematic release.

This version of unstructured-ingest might be problematic.

Files changed (192)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
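Most of the 192-file delta is packaging cleanup rather than library change: the examples/ tree (entries 54–77) and the test/ tree (entries 78–188) that 0.7.1 shipped inside the wheel are removed, along with 0.7.1's top_level.txt. A minimal, non-authoritative sketch for checking what an installed copy actually ships, using only the stdlib importlib.metadata and assuming unstructured-ingest is installed in the current environment:

from importlib.metadata import files, version

print("installed:", version("unstructured-ingest"))

# files() lists every path recorded in the wheel's RECORD; for 0.7.1 this
# should include test/... and examples/... entries, for 1.0.1 it should not.
shipped = files("unstructured-ingest") or []
extras = [str(p) for p in shipped if str(p).startswith(("test/", "examples/"))]
print(extras or "no test/ or examples/ files shipped")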
test/integration/connectors/databricks/test_volumes_native.py
@@ -1,273 +0,0 @@
-import json
-import os
-import uuid
-from contextlib import contextmanager
-from dataclasses import dataclass
-from pathlib import Path
-from unittest import mock
-
-import pytest
-from databricks.sdk import WorkspaceClient
-from databricks.sdk.errors.platform import NotFound
-
-from test.integration.connectors.utils.constants import (
-    BLOB_STORAGE_TAG,
-    DESTINATION_TAG,
-    SOURCE_TAG,
-)
-from test.integration.connectors.utils.validation.source import (
-    SourceValidationConfigs,
-    source_connector_validation,
-)
-from test.integration.utils import requires_env
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.errors_v2 import UserAuthError, UserError
-from unstructured_ingest.processes.connectors.databricks.volumes_native import (
-    CONNECTOR_TYPE,
-    DatabricksNativeVolumesAccessConfig,
-    DatabricksNativeVolumesConnectionConfig,
-    DatabricksNativeVolumesDownloader,
-    DatabricksNativeVolumesDownloaderConfig,
-    DatabricksNativeVolumesIndexer,
-    DatabricksNativeVolumesIndexerConfig,
-    DatabricksNativeVolumesUploader,
-    DatabricksNativeVolumesUploaderConfig,
-)
-
-
-@dataclass
-class BaseEnvData:
-    host: str
-    catalog: str
-
-
-@dataclass
-class BasicAuthEnvData(BaseEnvData):
-    client_id: str
-    client_secret: str
-
-    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
-        return DatabricksNativeVolumesConnectionConfig(
-            host=self.host,
-            access_config=DatabricksNativeVolumesAccessConfig(
-                client_id=self.client_id,
-                client_secret=self.client_secret,
-            ),
-        )
-
-
-@dataclass
-class PATEnvData(BaseEnvData):
-    token: str
-
-    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
-        return DatabricksNativeVolumesConnectionConfig(
-            host=self.host,
-            access_config=DatabricksNativeVolumesAccessConfig(
-                token=self.token,
-            ),
-        )
-
-
-def get_basic_auth_env_data() -> BasicAuthEnvData:
-    return BasicAuthEnvData(
-        host=os.environ["DATABRICKS_HOST"],
-        client_id=os.environ["DATABRICKS_CLIENT_ID"],
-        client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
-        catalog=os.environ["DATABRICKS_CATALOG"],
-    )
-
-
-def get_pat_env_data() -> PATEnvData:
-    return PATEnvData(
-        host=os.environ["DATABRICKS_HOST"],
-        catalog=os.environ["DATABRICKS_CATALOG"],
-        token=os.environ["DATABRICKS_PAT"],
-    )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
-@requires_env(
-    "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
-)
-async def test_volumes_native_source(tmp_path: Path):
-    env_data = get_basic_auth_env_data()
-    with mock.patch.dict(os.environ, clear=True):
-        indexer_config = DatabricksNativeVolumesIndexerConfig(
-            recursive=True,
-            volume="test-platform",
-            volume_path="databricks-volumes-test-input",
-            catalog=env_data.catalog,
-        )
-        connection_config = env_data.get_connection_config()
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
-        indexer = DatabricksNativeVolumesIndexer(
-            connection_config=connection_config, index_config=indexer_config
-        )
-        downloader = DatabricksNativeVolumesDownloader(
-            connection_config=connection_config, download_config=download_config
-        )
-        await source_connector_validation(
-            indexer=indexer,
-            downloader=downloader,
-            configs=SourceValidationConfigs(
-                test_id="databricks_volumes_native",
-                expected_num_files=1,
-            ),
-        )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
-@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
-async def test_volumes_native_source_pat(tmp_path: Path):
-    env_data = get_pat_env_data()
-    with mock.patch.dict(os.environ, clear=True):
-        indexer_config = DatabricksNativeVolumesIndexerConfig(
-            recursive=True,
-            volume="test-platform",
-            volume_path="databricks-volumes-test-input",
-            catalog=env_data.catalog,
-        )
-        connection_config = env_data.get_connection_config()
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
-        indexer = DatabricksNativeVolumesIndexer(
-            connection_config=connection_config, index_config=indexer_config
-        )
-        downloader = DatabricksNativeVolumesDownloader(
-            connection_config=connection_config, download_config=download_config
-        )
-        await source_connector_validation(
-            indexer=indexer,
-            downloader=downloader,
-            configs=SourceValidationConfigs(
-                test_id="databricks_volumes_native_pat",
-                expected_num_files=1,
-            ),
-        )
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
-@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
-def test_volumes_native_source_pat_invalid_catalog():
-    env_data = get_pat_env_data()
-    with mock.patch.dict(os.environ, clear=True):
-        indexer_config = DatabricksNativeVolumesIndexerConfig(
-            recursive=True,
-            volume="test-platform",
-            volume_path="databricks-volumes-test-input",
-            catalog="fake_catalog",
-        )
-        indexer = DatabricksNativeVolumesIndexer(
-            connection_config=env_data.get_connection_config(), index_config=indexer_config
-        )
-        with pytest.raises(UserError):
-            _ = list(indexer.run())
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
-@requires_env("DATABRICKS_HOST")
-def test_volumes_native_source_pat_invalid_pat():
-    host = os.environ["DATABRICKS_HOST"]
-    with mock.patch.dict(os.environ, clear=True):
-        indexer_config = DatabricksNativeVolumesIndexerConfig(
-            recursive=True,
-            volume="test-platform",
-            volume_path="databricks-volumes-test-input",
-            catalog="fake_catalog",
-        )
-        connection_config = DatabricksNativeVolumesConnectionConfig(
-            host=host,
-            access_config=DatabricksNativeVolumesAccessConfig(
-                token="invalid-token",
-            ),
-        )
-        indexer = DatabricksNativeVolumesIndexer(
-            connection_config=connection_config, index_config=indexer_config
-        )
-        with pytest.raises(UserAuthError):
-            _ = list(indexer.run())
-
-
-def _get_volume_path(catalog: str, volume: str, volume_path: str):
-    return f"/Volumes/{catalog}/default/{volume}/{volume_path}"
-
-
-@contextmanager
-def databricks_destination_context(
-    env_data: BasicAuthEnvData, volume: str, volume_path
-) -> WorkspaceClient:
-    client = WorkspaceClient(
-        host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
-    )
-    try:
-        yield client
-    finally:
-        # Cleanup
-        try:
-            for file in client.files.list_directory_contents(
-                directory_path=_get_volume_path(env_data.catalog, volume, volume_path)
-            ):
-                client.files.delete(file.path)
-            client.files.delete_directory(_get_volume_path(env_data.catalog, volume, volume_path))
-        except NotFound:
-            # Directory was never created, don't need to delete
-            pass
-
-
-def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_path: str):
-    files = list(
-        client.files.list_directory_contents(
-            directory_path=_get_volume_path(catalog, volume, volume_path)
-        )
-    )
-
-    assert len(files) == 1
-
-    resp = client.files.download(files[0].path)
-    data = json.loads(resp.contents.read())
-
-    assert len(data) == 22
-    element_types = {v["type"] for v in data}
-    assert len(element_types) == 1
-    assert "CompositeElement" in element_types
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
-@requires_env(
-    "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
-)
-async def test_volumes_native_destination(upload_file: Path):
-    env_data = get_basic_auth_env_data()
-    volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock file data",
-    )
-    with databricks_destination_context(
-        volume="test-platform", volume_path=volume_path, env_data=env_data
-    ) as workspace_client:
-        connection_config = env_data.get_connection_config()
-        uploader = DatabricksNativeVolumesUploader(
-            connection_config=connection_config,
-            upload_config=DatabricksNativeVolumesUploaderConfig(
-                volume="test-platform",
-                volume_path=volume_path,
-                catalog=env_data.catalog,
-            ),
-        )
-        uploader.precheck()
-        if uploader.is_async():
-            await uploader.run_async(path=upload_file, file_data=file_data)
-        else:
-            uploader.run(path=upload_file, file_data=file_data)
-
-        validate_upload(
-            client=workspace_client,
-            catalog=env_data.catalog,
-            volume="test-platform",
-            volume_path=volume_path,
-        )
test/integration/connectors/discord/__init__.py: file without changes (empty file, removed in 1.0.1).
test/integration/connectors/discord/test_discord.py
@@ -1,90 +0,0 @@
-import os
-import tempfile
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Optional
-
-import pytest
-
-from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
-from test.integration.connectors.utils.validation.source import (
-    SourceValidationConfigs,
-    source_connector_validation,
-)
-from test.integration.utils import requires_env
-from unstructured_ingest.error import SourceConnectionError
-from unstructured_ingest.processes.connectors.discord import (
-    CONNECTOR_TYPE,
-    DiscordAccessConfig,
-    DiscordConnectionConfig,
-    DiscordDownloader,
-    DiscordDownloaderConfig,
-    DiscordIndexer,
-    DiscordIndexerConfig,
-)
-
-
-@dataclass(frozen=True)
-class EnvData:
-    token: Optional[str]
-    channels: Optional[list[str]]
-
-
-def get_env_data() -> EnvData:
-    return EnvData(
-        token=os.getenv("DISCORD_TOKEN"),
-        channels=os.getenv("DISCORD_CHANNELS", default=[]).split(","),
-    )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
-@requires_env("DISCORD_TOKEN", "DISCORD_CHANNELS")
-async def test_discord_source():
-    env = get_env_data()
-    indexer_config = DiscordIndexerConfig(channels=env.channels)
-    with tempfile.TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        connection_config = DiscordConnectionConfig(
-            access_config=DiscordAccessConfig(token=env.token)
-        )
-        download_config = DiscordDownloaderConfig(download_dir=tempdir_path)
-        indexer = DiscordIndexer(connection_config=connection_config, index_config=indexer_config)
-        downloader = DiscordDownloader(
-            connection_config=connection_config, download_config=download_config
-        )
-        expected_num_files = len(env.channels)
-        await source_connector_validation(
-            indexer=indexer,
-            downloader=downloader,
-            configs=SourceValidationConfigs(
-                test_id=CONNECTOR_TYPE,
-                expected_num_files=expected_num_files,
-                expected_number_indexed_file_data=expected_num_files,
-                validate_downloaded_files=True,
-            ),
-        )
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
-@requires_env("DISCORD_CHANNELS")
-def test_discord_source_precheck_fail_no_token():
-    indexer_config = DiscordIndexerConfig(channels=get_env_data().channels)
-
-    connection_config = DiscordConnectionConfig(access_config=DiscordAccessConfig(token=""))
-    indexer = DiscordIndexer(connection_config=connection_config, index_config=indexer_config)
-    with pytest.raises(SourceConnectionError):
-        indexer.precheck()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
-@requires_env("DISCORD_TOKEN")
-def test_discord_source_precheck_fail_no_channels():
-    indexer_config = DiscordIndexerConfig(channels=[])
-
-    connection_config = DiscordConnectionConfig(
-        access_config=DiscordAccessConfig(token=get_env_data().token)
-    )
-    indexer = DiscordIndexer(connection_config=connection_config, index_config=indexer_config)
-    with pytest.raises(SourceConnectionError):
-        indexer.precheck()
test/integration/connectors/duckdb/__init__.py: file without changes (empty file, removed in 1.0.1).
test/integration/connectors/duckdb/conftest.py
@@ -1,14 +0,0 @@
-from pathlib import Path
-
-import pytest
-
-int_test_dir = Path(__file__).parent
-assets_dir = int_test_dir / "assets"
-
-
-@pytest.fixture
-def duckdb_schema() -> Path:
-    schema_file = assets_dir / "duckdb-schema.sql"
-    assert schema_file.exists()
-    assert schema_file.is_file()
-    return schema_file
test/integration/connectors/duckdb/test_duckdb.py
@@ -1,90 +0,0 @@
-import json
-from pathlib import Path
-
-import duckdb
-import pytest
-from _pytest.fixtures import TopRequest
-
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
-from test.integration.connectors.utils.validation.destination import (
-    StagerValidationConfigs,
-    stager_validation,
-)
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.processes.connectors.duckdb.duckdb import (
-    CONNECTOR_TYPE,
-    DuckDBConnectionConfig,
-    DuckDBUploader,
-    DuckDBUploaderConfig,
-    DuckDBUploadStager,
-)
-
-
-@pytest.fixture
-def provisioned_db_file(duckdb_schema: Path, temp_dir: Path) -> Path:
-    db_path = Path(temp_dir) / "temp_duck.db"
-    with duckdb.connect(database=db_path) as duckdb_connection:
-        with duckdb_schema.open("r") as f:
-            query = f.read()
-        duckdb_connection.execute(query)
-        duckdb_connection.close()
-    return db_path
-
-
-def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
-    conn = None
-    try:
-        conn = duckdb.connect(db_path)
-        _results = conn.sql("select count(*) from elements").fetchall()
-        _count = _results[0][0]
-        assert (
-            _count == expected_num_elements
-        ), f"dest check failed: got {_count}, expected {expected_num_elements}"
-        conn.close()
-    finally:
-        if conn:
-            conn.close()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
-def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_dir: Path):
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
-
-    stager = DuckDBUploadStager()
-    staged_path = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=temp_dir,
-        output_filename=upload_file.name,
-    )
-
-    connection_config = DuckDBConnectionConfig(database=str(provisioned_db_file))
-    upload_config = DuckDBUploaderConfig()
-    uploader = DuckDBUploader(connection_config=connection_config, upload_config=upload_config)
-
-    uploader.run(path=staged_path, file_data=file_data)
-
-    with staged_path.open() as f:
-        data = json.load(f)
-    validate_duckdb_destination(db_path=provisioned_db_file, expected_num_elements=len(data))
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
-@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
-def test_duckdb_stager(
-    request: TopRequest,
-    upload_file_str: str,
-    tmp_path: Path,
-):
-    upload_file: Path = request.getfixturevalue(upload_file_str)
-    stager = DuckDBUploadStager()
-    stager_validation(
-        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-        input_file=upload_file,
-        stager=stager,
-        tmp_dir=tmp_path,
-    )
test/integration/connectors/duckdb/test_motherduck.py
@@ -1,95 +0,0 @@
-import os
-import uuid
-from pathlib import Path
-from typing import Generator
-
-import duckdb
-import pandas as pd
-import pytest
-
-from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
-from test.integration.utils import requires_env
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.processes.connectors.duckdb.motherduck import (
-    CONNECTOR_TYPE,
-    MotherDuckAccessConfig,
-    MotherDuckConnectionConfig,
-    MotherDuckUploader,
-    MotherDuckUploaderConfig,
-    MotherDuckUploadStager,
-)
-
-
-@pytest.fixture
-def md_token() -> str:
-    motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
-    assert motherduck_token
-    return motherduck_token
-
-
-@pytest.fixture
-def provisioned_db(md_token: str, duckdb_schema: Path) -> Generator[str, None, None]:
-    database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
-    try:
-        with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
-            with duckdb_schema.open("r") as f:
-                query = f.read()
-            md_conn.execute(f"CREATE DATABASE {database_name}")
-            md_conn.execute(f"USE {database_name}")
-            md_conn.execute(query)
-            md_conn.close()
-        yield database_name
-    finally:
-        with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
-            md_conn.execute(f"DROP DATABASE {database_name}")
-            md_conn.close()
-
-
-def validate_motherduck_destination(database: str, expected_num_elements: int, md_token: str):
-    conn = None
-    try:
-        conn = duckdb.connect(f"md:?motherduck_token={md_token}")
-        conn.execute(f"USE {database}")
-        _results = conn.sql("select count(*) from elements").fetchall()
-        _count = _results[0][0]
-        assert (
-            _count == expected_num_elements
-        ), f"dest check failed: got {_count}, expected {expected_num_elements}"
-        conn.close()
-    finally:
-        if conn:
-            conn.close()
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
-@requires_env("MOTHERDUCK_TOKEN")
-def test_motherduck_destination(
-    md_token: str, upload_file: Path, provisioned_db: str, temp_dir: Path
-):
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
-
-    stager = MotherDuckUploadStager()
-    staged_path = stager.run(
-        elements_filepath=upload_file,
-        file_data=file_data,
-        output_dir=temp_dir,
-        output_filename=upload_file.name,
-    )
-
-    access_config = MotherDuckAccessConfig(md_token=md_token)
-    connection_config = MotherDuckConnectionConfig(
-        database=provisioned_db, access_config=access_config
-    )
-    upload_config = MotherDuckUploaderConfig()
-    uploader = MotherDuckUploader(connection_config=connection_config, upload_config=upload_config)
-
-    uploader.run(path=staged_path, file_data=file_data)
-
-    staged_df = pd.read_json(staged_path, orient="records", lines=True)
-    validate_motherduck_destination(
-        database=provisioned_db, expected_num_elements=len(staged_df), md_token=md_token
-    )
test/integration/connectors/elasticsearch/__init__.py: file without changes (empty file, removed in 1.0.1).
test/integration/connectors/elasticsearch/conftest.py
@@ -1,34 +0,0 @@
-import json
-from pathlib import Path
-
-import pandas as pd
-import pytest
-
-int_test_dir = Path(__file__).parent
-assets_dir = int_test_dir / "assets"
-
-
-@pytest.fixture
-def movies_dataframe() -> pd.DataFrame:
-    movies_file = assets_dir / "wiki_movie_plots_small.csv"
-    assert movies_file.exists()
-    assert movies_file.is_file()
-    return pd.read_csv(movies_file).dropna().reset_index()
-
-
-@pytest.fixture
-def opensearch_elements_mapping() -> dict:
-    elements_mapping_file = assets_dir / "opensearch_elements_mappings.json"
-    assert elements_mapping_file.exists()
-    assert elements_mapping_file.is_file()
-    with elements_mapping_file.open() as fp:
-        return json.load(fp)
-
-
-@pytest.fixture
-def elasticsearch_elements_mapping() -> dict:
-    elements_mapping_file = assets_dir / "elasticsearch_elements_mappings.json"
-    assert elements_mapping_file.exists()
-    assert elements_mapping_file.is_file()
-    with elements_mapping_file.open() as fp:
-        return json.load(fp)