unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187) hide show
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +2 -2
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,257 +0,0 @@
1
- import os
2
- import uuid
3
-
4
- import pytest
5
- from googleapiclient.errors import HttpError
6
-
7
- from test.integration.connectors.utils.constants import (
8
- SOURCE_TAG,
9
- UNCATEGORIZED_TAG,
10
- )
11
- from test.integration.connectors.utils.validation.source import (
12
- SourceValidationConfigs,
13
- get_all_file_data,
14
- run_all_validations,
15
- update_fixtures,
16
- )
17
- from test.integration.utils import requires_env
18
- from unstructured_ingest.error import (
19
- SourceConnectionError,
20
- )
21
- from unstructured_ingest.interfaces import Downloader, Indexer
22
- from unstructured_ingest.processes.connectors.google_drive import (
23
- CONNECTOR_TYPE,
24
- GoogleDriveAccessConfig,
25
- GoogleDriveConnectionConfig,
26
- GoogleDriveDownloader,
27
- GoogleDriveDownloaderConfig,
28
- GoogleDriveIndexer,
29
- GoogleDriveIndexerConfig,
30
- )
31
-
32
-
33
- @pytest.fixture
34
- def google_drive_connection_config():
35
- """
36
- Build a valid GoogleDriveConnectionConfig using the environment variables.
37
- Expects:
38
- - GOOGLE_DRIVE_ID
39
- - GOOGLE_DRIVE_SERVICE_KEY
40
- """
41
- drive_id = os.getenv("GOOGLE_DRIVE_ID")
42
- service_key = os.getenv("GOOGLE_DRIVE_SERVICE_KEY")
43
- if not drive_id or not service_key:
44
- pytest.skip("Google Drive credentials not provided in environment variables.")
45
-
46
- access_config = GoogleDriveAccessConfig(service_account_key=service_key)
47
- return GoogleDriveConnectionConfig(drive_id=drive_id, access_config=access_config)
48
-
49
-
50
- @pytest.fixture
51
- def google_drive_empty_folder(google_drive_connection_config):
52
- """
53
- Creates an empty folder on Google Drive for testing the "empty folder" case.
54
- The folder is deleted after the test.
55
- """
56
- from google.oauth2 import service_account
57
- from googleapiclient.discovery import build
58
-
59
- access_config = google_drive_connection_config.access_config.get_secret_value()
60
- creds = service_account.Credentials.from_service_account_info(access_config.service_account_key)
61
- service = build("drive", "v3", credentials=creds)
62
-
63
- # Create an empty folder.
64
- file_metadata = {
65
- "name": f"utic-empty-folder-{uuid.uuid4()}",
66
- "mimeType": "application/vnd.google-apps.folder",
67
- }
68
- folder = service.files().create(body=file_metadata, fields="id, name").execute()
69
- folder_id = folder.get("id")
70
- try:
71
- yield folder_id
72
- finally:
73
- service.files().delete(fileId=folder_id).execute()
74
-
75
-
76
- @requires_env("GOOGLE_DRIVE_SERVICE_KEY")
77
- @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE)
78
- def test_google_drive_source(temp_dir):
79
- # Retrieve environment variables
80
- service_account_key = os.environ["GOOGLE_DRIVE_SERVICE_KEY"]
81
-
82
- # Create connection and indexer configurations
83
- access_config = GoogleDriveAccessConfig(service_account_key=service_account_key)
84
- connection_config = GoogleDriveConnectionConfig(
85
- drive_id="1XidSOO76VpZ4m0i3gJN2m1X0Obol3UAi",
86
- access_config=access_config,
87
- )
88
- index_config = GoogleDriveIndexerConfig(recursive=True)
89
-
90
- download_config = GoogleDriveDownloaderConfig(download_dir=temp_dir)
91
-
92
- # Instantiate indexer and downloader
93
- indexer = GoogleDriveIndexer(
94
- connection_config=connection_config,
95
- index_config=index_config,
96
- )
97
- downloader = GoogleDriveDownloader(
98
- connection_config=connection_config,
99
- download_config=download_config,
100
- )
101
-
102
- # Run the source connector validation
103
- source_connector_validation(
104
- indexer=indexer,
105
- downloader=downloader,
106
- configs=SourceValidationConfigs(
107
- test_id="google_drive_source",
108
- expected_num_files=1,
109
- validate_downloaded_files=True,
110
- ),
111
- )
112
-
113
-
114
- @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
115
- def source_connector_validation(
116
- indexer: Indexer,
117
- downloader: Downloader,
118
- configs: SourceValidationConfigs,
119
- overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
120
- ) -> None:
121
- # Run common validations on the process of running a source connector, supporting dynamic
122
- # validators that get passed in along with comparisons on the saved expected values.
123
- # If overwrite_fixtures is set to True, will ignore all validators but instead overwrite the
124
- # expected values with what gets generated by this test.
125
- all_predownload_file_data = []
126
- all_postdownload_file_data = []
127
- indexer.precheck()
128
- download_dir = downloader.download_config.download_dir
129
- test_output_dir = configs.test_output_dir()
130
-
131
- for file_data in indexer.run():
132
- assert file_data
133
- predownload_file_data = file_data.model_copy(deep=True)
134
- all_predownload_file_data.append(predownload_file_data)
135
- resp = downloader.run(file_data=file_data)
136
- if isinstance(resp, list):
137
- for r in resp:
138
- postdownload_file_data = r["file_data"].model_copy(deep=True)
139
- all_postdownload_file_data.append(postdownload_file_data)
140
- else:
141
- postdownload_file_data = resp["file_data"].model_copy(deep=True)
142
- all_postdownload_file_data.append(postdownload_file_data)
143
-
144
- if not overwrite_fixtures:
145
- print("Running validation")
146
- run_all_validations(
147
- configs=configs,
148
- predownload_file_data=all_predownload_file_data,
149
- postdownload_file_data=all_postdownload_file_data,
150
- download_dir=download_dir,
151
- test_output_dir=test_output_dir,
152
- )
153
- else:
154
- print("Running fixtures update")
155
- update_fixtures(
156
- output_dir=test_output_dir,
157
- download_dir=download_dir,
158
- all_file_data=get_all_file_data(
159
- all_predownload_file_data=all_predownload_file_data,
160
- all_postdownload_file_data=all_postdownload_file_data,
161
- ),
162
- save_downloads=configs.validate_downloaded_files,
163
- save_filedata=configs.validate_file_data,
164
- )
165
-
166
-
167
- # Precheck fails when the drive ID has an appended parameter (simulate copy-paste error)
168
- @pytest.mark.tags("google-drive", "precheck")
169
- @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
170
- def test_google_drive_precheck_invalid_parameter(google_drive_connection_config):
171
- # Append a query parameter as often happens when copying from a URL.
172
- invalid_drive_id = google_drive_connection_config.drive_id + "?usp=sharing"
173
- connection_config = GoogleDriveConnectionConfig(
174
- drive_id=invalid_drive_id,
175
- access_config=google_drive_connection_config.access_config,
176
- )
177
- index_config = GoogleDriveIndexerConfig(recursive=True)
178
- indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
179
- with pytest.raises(SourceConnectionError) as excinfo:
180
- indexer.precheck()
181
- assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()
182
-
183
-
184
- # Precheck fails due to lack of permission (simulate via monkeypatching).
185
- @pytest.mark.tags("google-drive", "precheck")
186
- @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
187
- def test_google_drive_precheck_no_permission(google_drive_connection_config, monkeypatch):
188
- index_config = GoogleDriveIndexerConfig(recursive=True)
189
- indexer = GoogleDriveIndexer(
190
- connection_config=google_drive_connection_config,
191
- index_config=index_config,
192
- )
193
-
194
- # Monkeypatch get_root_info to always raise an HTTP 403 error.
195
- def fake_get_root_info(files_client, object_id):
196
- raise HttpError(
197
- resp=type("Response", (), {"status": 403, "reason": "Forbidden"})(),
198
- content=b"Forbidden",
199
- )
200
-
201
- monkeypatch.setattr(indexer, "get_root_info", fake_get_root_info)
202
- with pytest.raises(SourceConnectionError) as excinfo:
203
- indexer.precheck()
204
- assert "forbidden" in str(excinfo.value).lower() or "permission" in str(excinfo.value).lower()
205
-
206
-
207
- # Precheck fails when the folder is empty.
208
- # @pytest.mark.tags("google-drive", "precheck")
209
- # @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
210
- # def test_google_drive_precheck_empty_folder(
211
- # google_drive_connection_config, google_drive_empty_folder
212
- # ):
213
- # # Use the empty folder's ID as the target.
214
- # connection_config = GoogleDriveConnectionConfig(
215
- # drive_id=google_drive_empty_folder,
216
- # access_config=google_drive_connection_config.access_config,
217
- # )
218
-
219
- # index_config = GoogleDriveIndexerConfig(recursive=True)
220
- # indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
221
- # with pytest.raises(SourceConnectionError) as excinfo:
222
- # indexer.precheck()
223
- # assert "empty folder" in str(excinfo.value).lower()
224
-
225
-
226
- @pytest.mark.tags("google-drive", "count", "integration")
227
- @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
228
- def test_google_drive_count_files(google_drive_connection_config):
229
- """
230
- This test verifies that the count_files_recursively method returns the expected count of files.
231
- According to the test credentials, there are 3 files in the root directory and 1 nested file,
232
- so the total count should be 4.
233
- """
234
- # I assumed that we're applying the same extension filter as with other tests
235
- # However, there are 6 files in total in the test dir
236
- extensions_filter = ["pdf", "docx"]
237
- with google_drive_connection_config.get_client() as client:
238
- count = GoogleDriveIndexer.count_files_recursively(
239
- client, google_drive_connection_config.drive_id, extensions_filter
240
- )
241
- assert count == 4, f"Expected file count of 4, but got {count}"
242
-
243
-
244
- # Precheck fails with a completely invalid drive ID.
245
- @pytest.mark.tags("google-drive", "precheck")
246
- @requires_env("GOOGLE_DRIVE_ID", "GOOGLE_DRIVE_SERVICE_KEY")
247
- def test_google_drive_precheck_invalid_drive_id(google_drive_connection_config):
248
- invalid_drive_id = "invalid_drive_id"
249
- connection_config = GoogleDriveConnectionConfig(
250
- drive_id=invalid_drive_id,
251
- access_config=google_drive_connection_config.access_config,
252
- )
253
- index_config = GoogleDriveIndexerConfig(recursive=True)
254
- indexer = GoogleDriveIndexer(connection_config=connection_config, index_config=index_config)
255
- with pytest.raises(SourceConnectionError) as excinfo:
256
- indexer.precheck()
257
- assert "invalid" in str(excinfo.value).lower() or "not found" in str(excinfo.value).lower()
@@ -1,67 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
- from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
6
- from test.integration.connectors.utils.validation.source import (
7
- SourceValidationConfigs,
8
- source_connector_validation,
9
- )
10
- from test.integration.utils import requires_env
11
- from unstructured_ingest.processes.connectors.jira import (
12
- CONNECTOR_TYPE,
13
- JiraAccessConfig,
14
- JiraConnectionConfig,
15
- JiraDownloader,
16
- JiraDownloaderConfig,
17
- JiraIndexer,
18
- JiraIndexerConfig,
19
- )
20
-
21
-
22
- @pytest.mark.asyncio
23
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
24
- @requires_env("JIRA_INGEST_USER_EMAIL", "JIRA_INGEST_API_TOKEN")
25
- async def test_jira_source(temp_dir):
26
- # Retrieve environment variables
27
- jira_url = os.environ.get(
28
- "JIRA_INGEST_URL", "https://unstructured-jira-connector-test.atlassian.net"
29
- )
30
- user_email = os.environ["JIRA_INGEST_USER_EMAIL"]
31
- api_token = os.environ["JIRA_INGEST_API_TOKEN"]
32
- projects = ["JCTP1"]
33
- boards = ["3"]
34
- issues = ["JCTP2-1", "JCTP2-2", "JCTP2-3"]
35
-
36
- # Create connection and indexer configurations
37
- access_config = JiraAccessConfig(password=api_token)
38
- connection_config = JiraConnectionConfig(
39
- url=jira_url,
40
- username=user_email,
41
- access_config=access_config,
42
- )
43
- index_config = JiraIndexerConfig(projects=projects, boards=boards, issues=issues)
44
-
45
- download_config = JiraDownloaderConfig(download_dir=temp_dir)
46
-
47
- # Instantiate indexer and downloader
48
- indexer = JiraIndexer(
49
- connection_config=connection_config,
50
- index_config=index_config,
51
- )
52
- downloader = JiraDownloader(
53
- connection_config=connection_config,
54
- download_config=download_config,
55
- )
56
-
57
- # Run the source connector validation
58
- await source_connector_validation(
59
- indexer=indexer,
60
- downloader=downloader,
61
- configs=SourceValidationConfigs(
62
- test_id="jira",
63
- expected_num_files=8,
64
- validate_file_data=True,
65
- validate_downloaded_files=True,
66
- ),
67
- )
@@ -1,247 +0,0 @@
1
- import os
2
- from pathlib import Path
3
- from typing import Literal, Union
4
- from uuid import uuid4
5
-
6
- import lancedb
7
- import pandas as pd
8
- import pyarrow as pa
9
- import pytest
10
- import pytest_asyncio
11
- from lancedb import AsyncConnection
12
- from upath import UPath
13
-
14
- from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
15
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
16
- from unstructured_ingest.processes.connectors.lancedb.aws import (
17
- LanceDBAwsAccessConfig,
18
- LanceDBAwsConnectionConfig,
19
- LanceDBAwsUploader,
20
- )
21
- from unstructured_ingest.processes.connectors.lancedb.azure import (
22
- LanceDBAzureAccessConfig,
23
- LanceDBAzureConnectionConfig,
24
- LanceDBAzureUploader,
25
- )
26
- from unstructured_ingest.processes.connectors.lancedb.gcp import (
27
- LanceDBGCSAccessConfig,
28
- LanceDBGCSConnectionConfig,
29
- LanceDBGSPUploader,
30
- )
31
- from unstructured_ingest.processes.connectors.lancedb.lancedb import (
32
- CONNECTOR_TYPE,
33
- LanceDBUploaderConfig,
34
- LanceDBUploadStager,
35
- )
36
- from unstructured_ingest.processes.connectors.lancedb.local import (
37
- LanceDBLocalAccessConfig,
38
- LanceDBLocalConnectionConfig,
39
- LanceDBLocalUploader,
40
- )
41
- from unstructured_ingest.utils.constants import RECORD_ID_LABEL
42
-
43
- DATABASE_NAME = "database"
44
- TABLE_NAME = "elements"
45
- DIMENSION = 384
46
- NUMBER_EXPECTED_ROWS = 22
47
- S3_BUCKET = "s3://utic-ingest-test-fixtures/"
48
- GS_BUCKET = "gs://utic-test-ingest-fixtures-output/"
49
- AZURE_BUCKET = "az://utic-ingest-test-fixtures-output/"
50
- REQUIRED_ENV_VARS = {
51
- "s3": ("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY"),
52
- "gcs": ("GCP_INGEST_SERVICE_KEY",),
53
- "az": ("AZURE_DEST_CONNECTION_STR",),
54
- "local": (),
55
- }
56
-
57
- SCHEMA = pa.schema(
58
- [
59
- pa.field(RECORD_ID_LABEL, pa.string()),
60
- pa.field("vector", pa.list_(pa.float16(), DIMENSION)),
61
- pa.field("text", pa.string(), nullable=True),
62
- pa.field("type", pa.string(), nullable=True),
63
- pa.field("element_id", pa.string(), nullable=True),
64
- pa.field("metadata-text_as_html", pa.string(), nullable=True),
65
- pa.field("metadata-filetype", pa.string(), nullable=True),
66
- pa.field("metadata-filename", pa.string(), nullable=True),
67
- pa.field("metadata-languages", pa.list_(pa.string()), nullable=True),
68
- pa.field("metadata-is_continuation", pa.bool_(), nullable=True),
69
- pa.field("metadata-page_number", pa.int32(), nullable=True),
70
- ]
71
- )
72
- NUMBER_EXPECTED_COLUMNS = len(SCHEMA.names)
73
-
74
-
75
- @pytest_asyncio.fixture
76
- async def connection_with_uri(request, tmp_path: Path):
77
- target = request.param
78
- uri = _get_uri(target, local_base_path=tmp_path)
79
-
80
- unset_variables = [env for env in REQUIRED_ENV_VARS[target] if env not in os.environ]
81
- if unset_variables:
82
- pytest.skip(
83
- reason="Following required environment variables were not set: "
84
- + f"{', '.join(unset_variables)}"
85
- )
86
-
87
- storage_options = {
88
- "aws_access_key_id": os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
89
- "aws_secret_access_key": os.getenv("S3_INGEST_TEST_SECRET_KEY"),
90
- "google_service_account_key": os.getenv("GCP_INGEST_SERVICE_KEY"),
91
- }
92
- azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
93
- if azure_connection_string:
94
- storage_options.update(_parse_azure_connection_string(azure_connection_string))
95
-
96
- storage_options = {key: value for key, value in storage_options.items() if value is not None}
97
- connection = await lancedb.connect_async(
98
- uri=uri,
99
- storage_options=storage_options,
100
- )
101
- await connection.create_table(name=TABLE_NAME, schema=SCHEMA)
102
-
103
- yield connection, uri
104
-
105
- await connection.drop_database()
106
-
107
-
108
- @pytest.mark.asyncio
109
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
110
- @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
111
- async def test_lancedb_destination(
112
- upload_file: Path,
113
- connection_with_uri: tuple[AsyncConnection, str],
114
- tmp_path: Path,
115
- ) -> None:
116
- connection, uri = connection_with_uri
117
- file_data = FileData(
118
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
119
- connector_type=CONNECTOR_TYPE,
120
- identifier="mock-file-data",
121
- )
122
- stager = LanceDBUploadStager()
123
- uploader = _get_uploader(uri)
124
- staged_file_path = stager.run(
125
- elements_filepath=upload_file,
126
- file_data=file_data,
127
- output_dir=tmp_path,
128
- output_filename=upload_file.name,
129
- )
130
-
131
- await uploader.run_async(path=staged_file_path, file_data=file_data)
132
-
133
- # Test upload to empty table
134
- with await connection.open_table(TABLE_NAME) as table:
135
- table_df: pd.DataFrame = await table.to_pandas()
136
-
137
- assert len(table_df) == NUMBER_EXPECTED_ROWS
138
- assert len(table_df.columns) == NUMBER_EXPECTED_COLUMNS
139
-
140
- assert table_df[RECORD_ID_LABEL][0] == file_data.identifier
141
- assert table_df["element_id"][0] == "2470d8dc42215b3d68413b55bf00fed2"
142
- assert table_df["type"][0] == "CompositeElement"
143
- assert table_df["metadata-filename"][0] == "DA-1p-with-duplicate-pages.pdf.json"
144
- assert table_df["metadata-text_as_html"][0] is None
145
-
146
- # Test upload of the second file, rows should be appended
147
- file_data.identifier = "mock-file-data-2"
148
- staged_second_file_path = stager.run(
149
- elements_filepath=upload_file,
150
- file_data=file_data,
151
- output_dir=tmp_path,
152
- output_filename=f"{upload_file.stem}-2{upload_file.suffix}",
153
- )
154
- await uploader.run_async(path=staged_second_file_path, file_data=file_data)
155
- with await connection.open_table(TABLE_NAME) as table:
156
- appended_table_df: pd.DataFrame = await table.to_pandas()
157
- assert len(appended_table_df) == 2 * NUMBER_EXPECTED_ROWS
158
-
159
- # Test re-upload of the first file, rows should be overwritten, not appended
160
- await uploader.run_async(path=staged_file_path, file_data=file_data)
161
- with await connection.open_table(TABLE_NAME) as table:
162
- overwritten_table_df: pd.DataFrame = await table.to_pandas()
163
- assert len(overwritten_table_df) == 2 * NUMBER_EXPECTED_ROWS
164
-
165
-
166
- class TestPrecheck:
167
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
168
- @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
169
- def test_succeeds(
170
- self,
171
- upload_file: Path,
172
- connection_with_uri: tuple[AsyncConnection, str],
173
- tmp_path: Path,
174
- ) -> None:
175
- _, uri = connection_with_uri
176
- uploader = _get_uploader(uri)
177
- uploader.precheck()
178
-
179
-
180
- def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path) -> str:
181
- if target == "local":
182
- return str(local_base_path / DATABASE_NAME)
183
- if target == "s3":
184
- base_uri = UPath(S3_BUCKET)
185
- elif target == "gcs":
186
- base_uri = UPath(GS_BUCKET)
187
- elif target == "az":
188
- base_uri = UPath(AZURE_BUCKET)
189
-
190
- return str(base_uri / "destination" / "lancedb" / str(uuid4()) / DATABASE_NAME)
191
-
192
-
193
- def _get_uploader(
194
- uri: str,
195
- ) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBAwsUploader, LanceDBGSPUploader]:
196
- target = uri.split("://", maxsplit=1)[0] if uri.startswith(("s3", "az", "gs")) else "local"
197
- upload_config = LanceDBUploaderConfig(table_name=TABLE_NAME)
198
- if target == "az":
199
- azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
200
- access_config_kwargs = _parse_azure_connection_string(azure_connection_string)
201
- return LanceDBAzureUploader(
202
- upload_config=upload_config,
203
- connection_config=LanceDBAzureConnectionConfig(
204
- access_config=LanceDBAzureAccessConfig(**access_config_kwargs),
205
- uri=uri,
206
- ),
207
- )
208
-
209
- elif target == "s3":
210
- return LanceDBAwsUploader(
211
- upload_config=upload_config,
212
- connection_config=LanceDBAwsConnectionConfig(
213
- access_config=LanceDBAwsAccessConfig(
214
- aws_access_key_id=os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
215
- aws_secret_access_key=os.getenv("S3_INGEST_TEST_SECRET_KEY"),
216
- ),
217
- uri=uri,
218
- ),
219
- )
220
- elif target == "gs":
221
- return LanceDBGSPUploader(
222
- upload_config=upload_config,
223
- connection_config=LanceDBGCSConnectionConfig(
224
- access_config=LanceDBGCSAccessConfig(
225
- google_service_account_key=os.getenv("GCP_INGEST_SERVICE_KEY")
226
- ),
227
- uri=uri,
228
- ),
229
- )
230
- else:
231
- return LanceDBLocalUploader(
232
- upload_config=upload_config,
233
- connection_config=LanceDBLocalConnectionConfig(
234
- access_config=LanceDBLocalAccessConfig(),
235
- uri=uri,
236
- ),
237
- )
238
-
239
-
240
- def _parse_azure_connection_string(
241
- connection_str: str,
242
- ) -> dict[Literal["azure_storage_account_name", "azure_storage_account_key"], str]:
243
- parameters = dict(keyvalue.split("=", maxsplit=1) for keyvalue in connection_str.split(";"))
244
- return {
245
- "azure_storage_account_name": parameters.get("AccountName"),
246
- "azure_storage_account_key": parameters.get("AccountKey"),
247
- }