unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (192) hide show
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,244 +0,0 @@
1
- import json
2
- import time
3
- import uuid
4
- from datetime import datetime
5
- from pathlib import Path
6
-
7
- import pytest
8
- from neo4j import AsyncGraphDatabase, Driver, GraphDatabase
9
- from neo4j.exceptions import ServiceUnavailable
10
- from pytest_check import check
11
-
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG, GRAPH_DB_TAG
13
- from test.integration.connectors.utils.docker import container_context
14
- from unstructured_ingest.data_types.file_data import (
15
- FileData,
16
- FileDataSourceMetadata,
17
- SourceIdentifiers,
18
- )
19
- from unstructured_ingest.error import DestinationConnectionError
20
- from unstructured_ingest.processes.connectors.neo4j import (
21
- CONNECTOR_TYPE,
22
- Label,
23
- Neo4jAccessConfig,
24
- Neo4jConnectionConfig,
25
- Neo4jUploader,
26
- Neo4jUploaderConfig,
27
- Neo4jUploadStager,
28
- Relationship,
29
- )
30
- from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
31
-
32
# Connection settings shared by the fixtures and tests in this module.
USERNAME = "neo4j"
PASSWORD = "password"
URI = "neo4j://localhost:7687"
DATABASE = "neo4j"

# Number of document nodes the graph validation expects to find.
EXPECTED_DOCUMENT_COUNT = 1
38
-
39
-
40
# NOTE: Precheck tests are read-only, so one container is shared by every test in
# this module. If a new test needs a clean neo4j container, adjust this fixture's scope.
@pytest.fixture(autouse=True, scope="module")
def _neo4j_server():
    """Start a Neo4j container and yield once the server accepts connections.

    The ``container_context`` block is held open across the ``yield``, so the
    container is only torn down when the fixture itself is finalized.
    """
    with container_context(
        image="neo4j:latest",
        # Built from the module constants so the container credentials cannot
        # drift from the ones the tests connect with.
        environment={"NEO4J_AUTH": f"{USERNAME}/{PASSWORD}"},
        ports={"7687": "7687"},
    ):
        driver = GraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
        try:
            wait_for_connection(driver)
        finally:
            # Release the probe driver even when the readiness wait fails.
            driver.close()
        yield
51
-
52
-
53
@pytest.mark.asyncio
@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
    """Stage and upload a file twice, validating the graph after each upload.

    The second pass uploads the same elements with freshly generated element ids.
    """
    stager = Neo4jUploadStager()
    connection_config = Neo4jConnectionConfig(
        access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
        username=USERNAME,
        uri=URI,
        database=DATABASE,
    )
    uploader = Neo4jUploader(
        connection_config=connection_config,
        upload_config=Neo4jUploaderConfig(),
    )

    source_identifiers = SourceIdentifiers(
        filename=upload_file.name,
        fullpath=upload_file.name,
    )
    source_metadata = FileDataSourceMetadata(
        date_created=str(datetime(2022, 1, 1).timestamp()),
        date_modified=str(datetime(2022, 1, 2).timestamp()),
    )
    file_data = FileData(
        identifier="mock-file-data",
        connector_type="neo4j",
        source_identifiers=source_identifiers,
        metadata=source_metadata,
    )

    # First pass: upload the file exactly as provided.
    first_staged = stager.run(
        upload_file,
        file_data=file_data,
        output_dir=tmp_path,
        output_filename=upload_file.name,
    )
    await uploader.run_async(first_staged, file_data)
    await validate_uploaded_graph(upload_file)

    # Second pass: same content, but every element receives a new random id.
    modified_upload_file = tmp_path / f"modified-{upload_file.name}"
    with open(upload_file) as src:
        reissued_elements = json.load(src)
    for item in reissued_elements:
        item["element_id"] = str(uuid.uuid4())
    with open(modified_upload_file, "w") as dst:
        json.dump(reissued_elements, dst, indent=4)

    second_staged = stager.run(
        modified_upload_file,
        file_data=file_data,
        output_dir=tmp_path,
        output_filename=modified_upload_file.name,
    )
    await uploader.run_async(second_staged, file_data)
    await validate_uploaded_graph(modified_upload_file)
105
-
106
-
107
@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
class TestPrecheck:
    """Checks ``Neo4jUploader.precheck()`` against valid and invalid connection settings.

    ``pytest.raises(match=...)`` treats its pattern as a regular expression, so
    the Neo4j error codes below are written with their braces and dots escaped
    to make them match literally.
    """

    @pytest.fixture
    def configured_uploader(self) -> Neo4jUploader:
        """Return an uploader built from the module-level connection constants."""
        return Neo4jUploader(
            connection_config=Neo4jConnectionConfig(
                access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
                username=USERNAME,
                uri=URI,
                database=DATABASE,
            ),
            upload_config=Neo4jUploaderConfig(),
        )

    def test_succeeds(self, configured_uploader: Neo4jUploader):
        """Precheck raises nothing with the default configuration."""
        configured_uploader.precheck()

    def test_fails_on_invalid_password(self, configured_uploader: Neo4jUploader):
        """A wrong password surfaces as an ``Unauthorized`` connection error."""
        configured_uploader.connection_config.access_config.get_secret_value().password = (
            "invalid-password"
        )
        with pytest.raises(
            DestinationConnectionError,
            match=r"\{code: Neo\.ClientError\.Security\.Unauthorized\}",
        ):
            configured_uploader.precheck()

    def test_fails_on_invalid_username(self, configured_uploader: Neo4jUploader):
        """A wrong username surfaces as an ``Unauthorized`` connection error."""
        configured_uploader.connection_config.username = "invalid-username"
        with pytest.raises(
            DestinationConnectionError,
            match=r"\{code: Neo\.ClientError\.Security\.Unauthorized\}",
        ):
            configured_uploader.precheck()

    @pytest.mark.parametrize(
        ("uri", "expected_error_msg"),
        [
            ("neo4j://localhst:7687", "Cannot resolve address"),
            ("neo4j://localhost:7777", "Unable to retrieve routing information"),
        ],
    )
    def test_fails_on_invalid_uri(
        self, configured_uploader: Neo4jUploader, uri: str, expected_error_msg: str
    ):
        """A misspelled host or a wrong port surfaces as a connection error."""
        configured_uploader.connection_config.uri = uri
        with pytest.raises(DestinationConnectionError, match=expected_error_msg):
            configured_uploader.precheck()

    def test_fails_on_invalid_database(self, configured_uploader: Neo4jUploader):
        """An unknown database name surfaces as a ``DatabaseNotFound`` error."""
        configured_uploader.connection_config.database = "invalid-database"
        with pytest.raises(
            DestinationConnectionError,
            match=r"\{code: Neo\.ClientError\.Database\.DatabaseNotFound\}",
        ):
            configured_uploader.precheck()
161
-
162
-
163
def wait_for_connection(driver: Driver, retries: int = 10, delay_seconds: int = 2):
    """Poll ``driver.verify_connectivity()`` until it succeeds or attempts run out.

    Every ``ServiceUnavailable`` is followed by a ``delay_seconds`` sleep. Once
    ``retries`` attempts have failed, the calling test is failed via ``pytest.fail``.
    """
    remaining = retries
    while remaining > 0:
        try:
            driver.verify_connectivity()
        except ServiceUnavailable:
            time.sleep(delay_seconds)
            remaining -= 1
        else:
            return

    pytest.fail("Failed to connect with Neo4j server.")
174
-
175
-
176
async def validate_uploaded_graph(upload_file: Path):
    """Compare node and relationship counts in Neo4j with the contents of ``upload_file``.

    Expected counts come from the JSON elements in ``upload_file``: one node per
    chunk, one per distinct original element id, plus ``EXPECTED_DOCUMENT_COUNT``.
    Relationship counts are asserted only when no earlier check has failed.
    """
    with open(upload_file) as source:
        chunks = json.load(source)

    # Decode compressed ``orig_elements`` in place; chunks without the key get [].
    for chunk in chunks:
        metadata = chunk["metadata"]
        if "orig_elements" in metadata:
            metadata["orig_elements"] = elements_from_base64_gzipped_json(
                metadata["orig_elements"]
            )
        else:
            metadata["orig_elements"] = []

    # Original elements are counted by distinct id across all chunks.
    origin_ids = set()
    for chunk in chunks:
        for origin_element in chunk["metadata"]["orig_elements"]:
            origin_ids.add(origin_element["element_id"])

    expected_chunks_count = len(chunks)
    expected_element_count = len(origin_ids)
    expected_nodes_count = expected_chunks_count + expected_element_count + EXPECTED_DOCUMENT_COUNT

    driver = AsyncGraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))

    async def count_records(query: str) -> int:
        # The first item of the query result holds the returned records.
        result = await driver.execute_query(query)
        return len(result[0])

    try:
        nodes_count = await count_records("MATCH (n) RETURN n")
        chunk_nodes_count = await count_records(f"MATCH (n: {Label.CHUNK.value}) RETURN n")
        document_nodes_count = await count_records(
            f"MATCH (n: {Label.DOCUMENT.value}) RETURN n"
        )
        element_nodes_count = await count_records(
            f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT.value}) RETURN n"
        )

        with check:
            assert nodes_count == expected_nodes_count
        with check:
            assert document_nodes_count == EXPECTED_DOCUMENT_COUNT
        with check:
            assert chunk_nodes_count == expected_chunks_count
        with check:
            assert element_nodes_count == expected_element_count

        part_of_document_count = await count_records(
            f"""
            MATCH ()-[r:{Relationship.PART_OF_DOCUMENT.value}]->(:{Label.DOCUMENT.value})
            RETURN r
            """
        )
        next_chunk_count = await count_records(
            f"""
            MATCH (:{Label.CHUNK.value})-[r:{Relationship.NEXT_CHUNK.value}]->(:{Label.CHUNK.value})
            RETURN r
            """
        )

        if not check.any_failures():
            with check:
                assert part_of_document_count == expected_chunks_count + expected_element_count
            with check:
                assert next_chunk_count == expected_chunks_count - 1
    finally:
        await driver.close()
@@ -1,152 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
- from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
6
- from test.integration.connectors.utils.validation.source import (
7
- SourceValidationConfigs,
8
- get_all_file_data,
9
- run_all_validations,
10
- update_fixtures,
11
- )
12
- from unstructured_ingest.interfaces import Downloader, Indexer
13
- from unstructured_ingest.processes.connectors.notion.connector import (
14
- CONNECTOR_TYPE,
15
- NotionAccessConfig,
16
- NotionConnectionConfig,
17
- NotionDownloader,
18
- NotionDownloaderConfig,
19
- NotionIndexer,
20
- NotionIndexerConfig,
21
- )
22
-
23
-
24
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
def test_notion_source_database(temp_dir):
    """Index and download one Notion database, then run the source validation.

    Requires the ``NOTION_API_KEY`` environment variable.
    """
    connection_config = NotionConnectionConfig(
        access_config=NotionAccessConfig(notion_api_key=os.environ["NOTION_API_KEY"]),
    )

    indexer = NotionIndexer(
        connection_config=connection_config,
        index_config=NotionIndexerConfig(
            database_ids=["1722c3765a0a8082b382ebc2c62d3f4c"], recursive=False
        ),
    )
    downloader = NotionDownloader(
        connection_config=connection_config,
        download_config=NotionDownloaderConfig(download_dir=temp_dir),
    )

    # Creation/modification dates are excluded from the comparison.
    validation_configs = SourceValidationConfigs(
        test_id="notion_database",
        expected_num_files=1,
        validate_downloaded_files=True,
        exclude_fields_extend=["metadata.date_created", "metadata.date_modified"],
    )
    source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=validation_configs,
    )
61
-
62
-
63
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
def test_notion_source_page(temp_dir):
    """Index and download one Notion page, then run the source validation.

    Requires the ``NOTION_API_KEY`` environment variable.
    """
    connection_config = NotionConnectionConfig(
        access_config=NotionAccessConfig(notion_api_key=os.environ["NOTION_API_KEY"]),
    )

    indexer = NotionIndexer(
        connection_config=connection_config,
        index_config=NotionIndexerConfig(
            page_ids=["1572c3765a0a806299f0dd6999f9e4c7"], recursive=False
        ),
    )
    downloader = NotionDownloader(
        connection_config=connection_config,
        download_config=NotionDownloaderConfig(download_dir=temp_dir),
    )

    # Creation/modification dates are excluded from the comparison.
    validation_configs = SourceValidationConfigs(
        test_id="notion_page",
        expected_num_files=1,
        validate_downloaded_files=True,
        exclude_fields_extend=["metadata.date_created", "metadata.date_modified"],
    )
    source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=validation_configs,
    )
100
-
101
-
102
def source_connector_validation(
    indexer: Indexer,
    downloader: Downloader,
    configs: SourceValidationConfigs,
    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
) -> None:
    """Run a source connector end to end, then validate it or refresh its fixtures.

    Each file yielded by the indexer is deep-copied before and after download.
    If ``overwrite_fixtures`` is False, those snapshots are passed to
    ``run_all_validations``. If it is True, validation is skipped and the
    expected values are overwritten with what this run produced.

    This is a plain helper rather than a test, so it carries no pytest marker.

    Args:
        indexer: Indexer to precheck and run.
        downloader: Downloader run on every indexed file; its
            ``download_config.download_dir`` is used as the download directory.
        configs: Validation settings, which also supply the test output directory.
        overwrite_fixtures: Defaults to whether the ``OVERWRITE_FIXTURES``
            environment variable equals "true" (case-insensitive). The default
            is evaluated once, when the function is defined.
    """
    all_predownload_file_data = []
    all_postdownload_file_data = []
    indexer.precheck()
    download_dir = downloader.download_config.download_dir
    test_output_dir = configs.test_output_dir()

    for file_data in indexer.run():
        assert file_data
        # Snapshot before download so the pre-download state is kept separately.
        all_predownload_file_data.append(file_data.model_copy(deep=True))
        resp = downloader.run(file_data=file_data)
        # A download yields either one response or a list of them.
        responses = resp if isinstance(resp, list) else [resp]
        all_postdownload_file_data.extend(
            response["file_data"].model_copy(deep=True) for response in responses
        )

    if not overwrite_fixtures:
        print("Running validation")
        run_all_validations(
            configs=configs,
            predownload_file_data=all_predownload_file_data,
            postdownload_file_data=all_postdownload_file_data,
            download_dir=download_dir,
            test_output_dir=test_output_dir,
        )
    else:
        print("Running fixtures update")
        update_fixtures(
            output_dir=test_output_dir,
            download_dir=download_dir,
            all_file_data=get_all_file_data(
                all_predownload_file_data=all_predownload_file_data,
                all_postdownload_file_data=all_postdownload_file_data,
            ),
            save_downloads=configs.validate_downloaded_files,
            save_filedata=configs.validate_file_data,
        )
@@ -1,163 +0,0 @@
1
- import os
2
- import uuid
3
- from pathlib import Path
4
-
5
- import pytest
6
- from office365.graph_client import GraphClient
7
-
8
- from test.integration.connectors.utils.constants import (
9
- BLOB_STORAGE_TAG,
10
- DESTINATION_TAG,
11
- SOURCE_TAG,
12
- )
13
- from test.integration.connectors.utils.validation.source import (
14
- SourceValidationConfigs,
15
- source_connector_validation,
16
- )
17
- from test.integration.utils import requires_env
18
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
19
- from unstructured_ingest.processes.connectors.onedrive import (
20
- CONNECTOR_TYPE,
21
- OnedriveAccessConfig,
22
- OnedriveConnectionConfig,
23
- OnedriveDownloader,
24
- OnedriveDownloaderConfig,
25
- OnedriveIndexer,
26
- OnedriveIndexerConfig,
27
- OnedriveUploader,
28
- OnedriveUploaderConfig,
29
- )
30
-
31
-
32
@pytest.fixture
def onedrive_test_folder() -> str:
    """Create a uniquely named folder in the OneDrive root and yield its path.

    The folder is deleted again when the fixture is torn down.
    """
    connection_config = get_connection_config()
    client: GraphClient = connection_config.get_client()
    user_drive = client.users[connection_config.user_pname].drive

    # A random suffix gives every run its own folder name.
    test_folder_path = f"utic-test-output-{uuid.uuid4()}"
    folder = user_drive.root.create_folder(test_folder_path).execute_query()
    print(f"created folder: {folder.name}")
    try:
        yield test_folder_path
    finally:
        # Teardown: remove the folder once the test is done.
        folder.delete_object().execute_query()
        print(f"successfully deleted folder: {folder.name}")
57
-
58
-
59
def get_connection_config():
    """Build an ``OnedriveConnectionConfig`` from the ``MS_*`` environment variables.

    Reads ``MS_CLIENT_ID``, ``MS_CLIENT_CRED``, ``MS_TENANT_ID`` and
    ``MS_USER_PNAME``. A variable that is not set is passed through as ``None``.
    """
    return OnedriveConnectionConfig(
        client_id=os.getenv("MS_CLIENT_ID"),
        tenant=os.getenv("MS_TENANT_ID"),
        user_pname=os.getenv("MS_USER_PNAME"),
        access_config=OnedriveAccessConfig(client_cred=os.getenv("MS_CLIENT_CRED")),
    )
75
-
76
-
77
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_onedrive_source(temp_dir):
    """Index the ``eml`` path recursively, download it, and run the source validation."""
    connection_config = get_connection_config()

    indexer = OnedriveIndexer(
        connection_config=connection_config,
        index_config=OnedriveIndexerConfig(recursive=True, path="eml"),
    )
    downloader = OnedriveDownloader(
        connection_config=connection_config,
        download_config=OnedriveDownloaderConfig(download_dir=temp_dir),
    )

    # These fields are left out of the comparison.
    excluded_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    validation_configs = SourceValidationConfigs(
        test_id="onedrive",
        expected_num_files=1,
        validate_downloaded_files=True,
        exclude_fields_extend=excluded_fields,
    )
    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=validation_configs,
    )
112
-
113
-
114
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
def xtest_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
    """
    Upload a file with the OneDrive destination connector and verify it exists.

    The file goes into the folder provided by the ``onedrive_test_folder``
    fixture and is then looked up by path with a ``.json`` suffix appended.
    """
    connection_config = get_connection_config()
    destination_fullpath = f"{onedrive_test_folder}/{upload_file.name}"

    uploader = OnedriveUploader(
        connection_config=connection_config,
        upload_config=OnedriveUploaderConfig(remote_url=f"onedrive://{onedrive_test_folder}"),
    )
    file_data = FileData(
        identifier="mock_file_data",
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(
            fullpath=destination_fullpath,
            filename=upload_file.name,
        ),
    )
    uploader.precheck()
    uploader.run(path=upload_file, file_data=file_data)

    # Look the uploaded file up through a fresh client.
    drive = connection_config.get_client().users[connection_config.user_pname].drive

    # Workaround: File should not have .json in the metadata.filename it comes from embedder
    lookup = drive.root.get_by_path(f"{destination_fullpath}.json")
    uploaded_file = lookup.select(["id", "name"]).get().execute_query()

    assert uploaded_file is not None
    assert uploaded_file.name == f"{upload_file.name}.json"