unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; more details are available on the original registry page.

Files changed (192)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,143 +0,0 @@
1
- import asyncio
2
- import json
3
- import os
4
- from pathlib import Path
5
- from typing import Optional
6
-
7
- import numpy as np
8
- import pytest
9
- from redis import exceptions as redis_exceptions
10
- from redis.asyncio import Redis, from_url
11
-
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
13
- from test.integration.utils import requires_env
14
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
15
- from unstructured_ingest.processes.connectors.redisdb import (
16
- CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
17
- )
18
- from unstructured_ingest.processes.connectors.redisdb import (
19
- RedisAccessConfig,
20
- RedisConnectionConfig,
21
- RedisUploader,
22
- RedisUploaderConfig,
23
- )
24
-
25
-
26
- async def delete_record(client: Redis, element_id: str, key_prefix: str) -> None:
27
- key_with_prefix = f"{key_prefix}{element_id}"
28
- await client.delete(key_with_prefix)
29
-
30
-
31
- async def validate_upload(client: Redis, first_element: dict, key_prefix: str) -> None:
32
- element_id = first_element["element_id"]
33
- key_with_prefix = f"{key_prefix}{element_id}"
34
- expected_text = first_element["text"]
35
- expected_embeddings = first_element["embeddings"]
36
- async with client.pipeline(transaction=True) as pipe:
37
- try:
38
- response = await pipe.json().get(key_with_prefix, "$").execute()
39
- response = response[0][0]
40
- except redis_exceptions.ResponseError:
41
- response = await pipe.get(key_with_prefix).execute()
42
- response = json.loads(response[0])
43
-
44
- embedding_similarity = np.linalg.norm(
45
- np.array(response["embeddings"]) - np.array(expected_embeddings)
46
- )
47
-
48
- assert response is not None
49
- assert response["element_id"] == element_id
50
- assert response["text"] == expected_text
51
- assert embedding_similarity < 1e-10
52
-
53
-
54
- async def redis_destination_test(
55
- upload_file: Path,
56
- tmp_path: Path,
57
- connection_kwargs: dict,
58
- uploader_config: dict,
59
- uri: Optional[str] = None,
60
- password: Optional[str] = None,
61
- ):
62
- uploader = RedisUploader(
63
- connection_config=RedisConnectionConfig(
64
- **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
65
- ),
66
- upload_config=RedisUploaderConfig(batch_size=10, **uploader_config),
67
- )
68
- key_prefix = uploader.upload_config.key_prefix
69
-
70
- file_data = FileData(
71
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
72
- connector_type=REDIS_CONNECTOR_TYPE,
73
- identifier="mock-file-data",
74
- )
75
- with upload_file.open() as upload_fp:
76
- elements = json.load(upload_fp)
77
- first_element = elements[0]
78
-
79
- try:
80
- if uploader.is_async():
81
- await uploader.run_data_async(data=elements, file_data=file_data)
82
-
83
- if uri:
84
- async with from_url(uri) as client:
85
- await validate_upload(
86
- client=client,
87
- first_element=first_element,
88
- key_prefix=key_prefix,
89
- )
90
- else:
91
- async with Redis(**connection_kwargs, password=password) as client:
92
- await validate_upload(
93
- client=client,
94
- first_element=first_element,
95
- key_prefix=key_prefix,
96
- )
97
- except Exception as e:
98
- raise e
99
- finally:
100
- if uri:
101
- async with from_url(uri) as client:
102
- tasks = [
103
- delete_record(client, element["element_id"], key_prefix) for element in elements
104
- ]
105
- await asyncio.gather(*tasks)
106
- else:
107
- async with Redis(**connection_kwargs, password=password) as client:
108
- tasks = [
109
- delete_record(client, element["element_id"], key_prefix) for element in elements
110
- ]
111
- await asyncio.gather(*tasks)
112
-
113
-
114
- @pytest.mark.asyncio
115
- @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
116
- @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
117
- async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
118
- connection_kwargs = {
119
- "host": "utic-dashboard-dev.redis.cache.windows.net",
120
- "port": 6380,
121
- "db": 0,
122
- "ssl": True,
123
- }
124
- uploader_config = {
125
- "key_prefix": "test_ingest:",
126
- }
127
- redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
128
- await redis_destination_test(
129
- upload_file, tmp_path, connection_kwargs, uploader_config, password=redis_pw
130
- )
131
-
132
-
133
- @pytest.mark.asyncio
134
- @pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis", NOSQL_TAG)
135
- @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
136
- async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
137
- connection_kwargs = {}
138
- uploader_config = {
139
- "key_prefix": "test_ingest:",
140
- }
141
- redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
142
- uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
143
- await redis_destination_test(upload_file, tmp_path, connection_kwargs, uploader_config, uri=uri)
@@ -1,184 +0,0 @@
1
- import os
2
- import tempfile
3
- import uuid
4
- from pathlib import Path
5
-
6
- import pytest
7
-
8
- from test.integration.connectors.utils.constants import (
9
- BLOB_STORAGE_TAG,
10
- DESTINATION_TAG,
11
- SOURCE_TAG,
12
- env_setup_path,
13
- )
14
- from test.integration.connectors.utils.docker_compose import docker_compose_context
15
- from test.integration.connectors.utils.validation.source import (
16
- SourceValidationConfigs,
17
- source_connector_validation,
18
- )
19
- from test.integration.utils import requires_env
20
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
21
- from unstructured_ingest.errors_v2 import UserAuthError, UserError
22
- from unstructured_ingest.processes.connectors.fsspec.s3 import (
23
- CONNECTOR_TYPE,
24
- S3AccessConfig,
25
- S3ConnectionConfig,
26
- S3Downloader,
27
- S3DownloaderConfig,
28
- S3Indexer,
29
- S3IndexerConfig,
30
- S3Uploader,
31
- S3UploaderConfig,
32
- )
33
-
34
-
35
- def validate_predownload_file_data(file_data: FileData):
36
- assert file_data.connector_type == CONNECTOR_TYPE
37
- assert file_data.local_download_path is None
38
-
39
-
40
- def validate_postdownload_file_data(file_data: FileData):
41
- assert file_data.connector_type == CONNECTOR_TYPE
42
- assert file_data.local_download_path is not None
43
-
44
-
45
- @pytest.fixture
46
- def anon_connection_config() -> S3ConnectionConfig:
47
- return S3ConnectionConfig(access_config=S3AccessConfig(), anonymous=True)
48
-
49
-
50
- @pytest.mark.asyncio
51
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
52
- async def test_s3_source(anon_connection_config: S3ConnectionConfig):
53
- indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/")
54
- with tempfile.TemporaryDirectory() as tempdir:
55
- tempdir_path = Path(tempdir)
56
- download_config = S3DownloaderConfig(download_dir=tempdir_path)
57
- indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
58
- downloader = S3Downloader(
59
- connection_config=anon_connection_config, download_config=download_config
60
- )
61
- await source_connector_validation(
62
- indexer=indexer,
63
- downloader=downloader,
64
- configs=SourceValidationConfigs(
65
- test_id="s3",
66
- predownload_file_data_check=validate_predownload_file_data,
67
- postdownload_file_data_check=validate_postdownload_file_data,
68
- expected_num_files=4,
69
- ),
70
- )
71
-
72
-
73
- @pytest.mark.asyncio
74
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
75
- async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig):
76
- indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/special-characters/")
77
- with tempfile.TemporaryDirectory() as tempdir:
78
- tempdir_path = Path(tempdir)
79
- download_config = S3DownloaderConfig(download_dir=tempdir_path)
80
- indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
81
- downloader = S3Downloader(
82
- connection_config=anon_connection_config, download_config=download_config
83
- )
84
- await source_connector_validation(
85
- indexer=indexer,
86
- downloader=downloader,
87
- configs=SourceValidationConfigs(
88
- test_id="s3-specialchar",
89
- predownload_file_data_check=validate_predownload_file_data,
90
- postdownload_file_data_check=validate_postdownload_file_data,
91
- expected_num_files=1,
92
- ),
93
- )
94
-
95
-
96
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
97
- def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
98
- indexer_config = S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/")
99
- indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
100
- with pytest.raises(UserAuthError):
101
- indexer.precheck()
102
-
103
-
104
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
105
- def test_s3_source_no_bucket(anon_connection_config: S3ConnectionConfig):
106
- indexer_config = S3IndexerConfig(remote_url="s3://fake-bucket")
107
- indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
108
- with pytest.raises(UserError):
109
- indexer.precheck()
110
-
111
-
112
- @pytest.mark.asyncio
113
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio", BLOB_STORAGE_TAG)
114
- async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
115
- anon_connection_config.endpoint_url = "http://localhost:9000"
116
- indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
117
- with docker_compose_context(docker_compose_path=env_setup_path / "minio" / "source"):
118
- with tempfile.TemporaryDirectory() as tempdir:
119
- tempdir_path = Path(tempdir)
120
- download_config = S3DownloaderConfig(download_dir=tempdir_path)
121
- indexer = S3Indexer(
122
- connection_config=anon_connection_config, index_config=indexer_config
123
- )
124
- downloader = S3Downloader(
125
- connection_config=anon_connection_config, download_config=download_config
126
- )
127
- await source_connector_validation(
128
- indexer=indexer,
129
- downloader=downloader,
130
- configs=SourceValidationConfigs(
131
- test_id="s3-minio",
132
- predownload_file_data_check=validate_predownload_file_data,
133
- postdownload_file_data_check=validate_postdownload_file_data,
134
- expected_num_files=1,
135
- exclude_fields_extend=[
136
- "metadata.date_modified",
137
- "metadata.date_created",
138
- "additional_metadata.LastModified",
139
- ],
140
- ),
141
- )
142
-
143
-
144
- def get_aws_credentials() -> dict:
145
- access_key = os.getenv("S3_INGEST_TEST_ACCESS_KEY", None)
146
- assert access_key
147
- secret_key = os.getenv("S3_INGEST_TEST_SECRET_KEY", None)
148
- assert secret_key
149
- return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}
150
-
151
-
152
- @pytest.mark.asyncio
153
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
154
- @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
155
- async def test_s3_destination(upload_file: Path):
156
- aws_credentials = get_aws_credentials()
157
- s3_bucket = "s3://utic-ingest-test-fixtures"
158
- destination_path = f"{s3_bucket}/destination/{uuid.uuid4()}"
159
- connection_config = S3ConnectionConfig(
160
- access_config=S3AccessConfig(
161
- key=aws_credentials["aws_access_key_id"],
162
- secret=aws_credentials["aws_secret_access_key"],
163
- ),
164
- )
165
- upload_config = S3UploaderConfig(remote_url=destination_path)
166
- uploader = S3Uploader(connection_config=connection_config, upload_config=upload_config)
167
- s3fs = uploader.fs
168
- file_data = FileData(
169
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
170
- connector_type=CONNECTOR_TYPE,
171
- identifier="mock file data",
172
- )
173
- try:
174
- uploader.precheck()
175
- if uploader.is_async():
176
- await uploader.run_async(path=upload_file, file_data=file_data)
177
- else:
178
- uploader.run(path=upload_file, file_data=file_data)
179
- uploaded_files = [
180
- Path(file) for file in s3fs.ls(path=destination_path) if Path(file).name != "_empty"
181
- ]
182
- assert len(uploaded_files) == 1
183
- finally:
184
- s3fs.rm(path=destination_path, recursive=True)
@@ -1,222 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
- from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
6
- from test.integration.connectors.utils.validation.source import (
7
- SourceValidationConfigs,
8
- source_connector_validation,
9
- )
10
- from test.integration.utils import requires_env
11
- from unstructured_ingest.processes.connectors.sharepoint import (
12
- CONNECTOR_TYPE,
13
- SharepointAccessConfig,
14
- SharepointConnectionConfig,
15
- SharepointDownloader,
16
- SharepointDownloaderConfig,
17
- SharepointIndexer,
18
- SharepointIndexerConfig,
19
- )
20
-
21
-
22
- def sharepoint_config():
23
- class SharepointTestConfig:
24
- def __init__(self):
25
- self.client_id = os.environ["SHAREPOINT_CLIENT_ID"]
26
- self.client_cred = os.environ["SHAREPOINT_CRED"]
27
- self.user_pname = os.environ["MS_USER_PNAME"]
28
- self.tenant = os.environ["MS_TENANT_ID"]
29
-
30
- return SharepointTestConfig()
31
-
32
-
33
- @pytest.mark.asyncio
34
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
35
- @requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
36
- async def test_sharepoint_source(temp_dir):
37
- site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
38
- config = sharepoint_config()
39
-
40
- # Create connection and indexer configurations
41
- access_config = SharepointAccessConfig(client_cred=config.client_cred)
42
- connection_config = SharepointConnectionConfig(
43
- client_id=config.client_id,
44
- site=site,
45
- tenant=config.tenant,
46
- user_pname=config.user_pname,
47
- access_config=access_config,
48
- )
49
- index_config = SharepointIndexerConfig(recursive=True)
50
-
51
- download_config = SharepointDownloaderConfig(download_dir=temp_dir)
52
-
53
- # Instantiate indexer and downloader
54
- indexer = SharepointIndexer(
55
- connection_config=connection_config,
56
- index_config=index_config,
57
- )
58
- downloader = SharepointDownloader(
59
- connection_config=connection_config,
60
- download_config=download_config,
61
- )
62
-
63
- # Run the source connector validation
64
- await source_connector_validation(
65
- indexer=indexer,
66
- downloader=downloader,
67
- configs=SourceValidationConfigs(
68
- test_id="sharepoint1",
69
- expected_num_files=4,
70
- validate_downloaded_files=True,
71
- exclude_fields_extend=[
72
- "metadata.date_created",
73
- "metadata.date_modified",
74
- "additional_metadata.LastModified",
75
- "additional_metadata.@microsoft.graph.downloadUrl",
76
- ],
77
- ),
78
- )
79
-
80
-
81
- @pytest.mark.asyncio
82
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
83
- @requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
84
- async def test_sharepoint_source_with_path(temp_dir):
85
- site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
86
- config = sharepoint_config()
87
-
88
- # Create connection and indexer configurations
89
- access_config = SharepointAccessConfig(client_cred=config.client_cred)
90
- connection_config = SharepointConnectionConfig(
91
- client_id=config.client_id,
92
- site=site,
93
- tenant=config.tenant,
94
- user_pname=config.user_pname,
95
- access_config=access_config,
96
- )
97
- index_config = SharepointIndexerConfig(recursive=True, path="Folder1")
98
-
99
- download_config = SharepointDownloaderConfig(download_dir=temp_dir)
100
-
101
- # Instantiate indexer and downloader
102
- indexer = SharepointIndexer(
103
- connection_config=connection_config,
104
- index_config=index_config,
105
- )
106
- downloader = SharepointDownloader(
107
- connection_config=connection_config,
108
- download_config=download_config,
109
- )
110
-
111
- # Run the source connector validation
112
- await source_connector_validation(
113
- indexer=indexer,
114
- downloader=downloader,
115
- configs=SourceValidationConfigs(
116
- test_id="sharepoint2",
117
- expected_num_files=2,
118
- validate_downloaded_files=True,
119
- exclude_fields_extend=[
120
- "metadata.date_created",
121
- "metadata.date_modified",
122
- "additional_metadata.LastModified",
123
- "additional_metadata.@microsoft.graph.downloadUrl",
124
- ],
125
- ),
126
- )
127
-
128
-
129
- @pytest.mark.asyncio
130
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
131
- @requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
132
- async def test_sharepoint_root_with_path(temp_dir):
133
- site = "https://unstructuredio.sharepoint.com/"
134
- config = sharepoint_config()
135
-
136
- # Create connection and indexer configurations
137
- access_config = SharepointAccessConfig(client_cred=config.client_cred)
138
- connection_config = SharepointConnectionConfig(
139
- client_id=config.client_id,
140
- site=site,
141
- tenant=config.tenant,
142
- user_pname=config.user_pname,
143
- access_config=access_config,
144
- )
145
- index_config = SharepointIndexerConfig(recursive=True, path="e2e-test-folder")
146
-
147
- download_config = SharepointDownloaderConfig(download_dir=temp_dir)
148
-
149
- # Instantiate indexer and downloader
150
- indexer = SharepointIndexer(
151
- connection_config=connection_config,
152
- index_config=index_config,
153
- )
154
- downloader = SharepointDownloader(
155
- connection_config=connection_config,
156
- download_config=download_config,
157
- )
158
-
159
- # Run the source connector validation
160
- await source_connector_validation(
161
- indexer=indexer,
162
- downloader=downloader,
163
- configs=SourceValidationConfigs(
164
- test_id="sharepoint3",
165
- expected_num_files=1,
166
- validate_downloaded_files=True,
167
- exclude_fields_extend=[
168
- "metadata.date_created",
169
- "metadata.date_modified",
170
- "additional_metadata.LastModified",
171
- "additional_metadata.@microsoft.graph.downloadUrl",
172
- ],
173
- ),
174
- )
175
-
176
-
177
- @pytest.mark.asyncio
178
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
179
- @requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
180
- async def test_sharepoint_shared_documents(temp_dir):
181
- site = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"
182
- config = sharepoint_config()
183
-
184
- # Create connection and indexer configurations
185
- access_config = SharepointAccessConfig(client_cred=config.client_cred)
186
- connection_config = SharepointConnectionConfig(
187
- client_id=config.client_id,
188
- site=site,
189
- tenant=config.tenant,
190
- user_pname=config.user_pname,
191
- access_config=access_config,
192
- )
193
- index_config = SharepointIndexerConfig(recursive=True, path="Shared Documents")
194
-
195
- download_config = SharepointDownloaderConfig(download_dir=temp_dir)
196
-
197
- # Instantiate indexer and downloader
198
- indexer = SharepointIndexer(
199
- connection_config=connection_config,
200
- index_config=index_config,
201
- )
202
- downloader = SharepointDownloader(
203
- connection_config=connection_config,
204
- download_config=download_config,
205
- )
206
-
207
- # Run the source connector validation
208
- await source_connector_validation(
209
- indexer=indexer,
210
- downloader=downloader,
211
- configs=SourceValidationConfigs(
212
- test_id="sharepoint4",
213
- expected_num_files=4,
214
- validate_downloaded_files=True,
215
- exclude_fields_extend=[
216
- "metadata.date_created",
217
- "metadata.date_modified",
218
- "additional_metadata.LastModified",
219
- "additional_metadata.@microsoft.graph.downloadUrl",
220
- ],
221
- ),
222
- )