unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187):
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +2 -2
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,143 +0,0 @@
1
- import asyncio
2
- import json
3
- import os
4
- from pathlib import Path
5
- from typing import Optional
6
-
7
- import numpy as np
8
- import pytest
9
- from redis import exceptions as redis_exceptions
10
- from redis.asyncio import Redis, from_url
11
-
12
- from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
13
- from test.integration.utils import requires_env
14
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
15
- from unstructured_ingest.processes.connectors.redisdb import (
16
- CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
17
- )
18
- from unstructured_ingest.processes.connectors.redisdb import (
19
- RedisAccessConfig,
20
- RedisConnectionConfig,
21
- RedisUploader,
22
- RedisUploaderConfig,
23
- )
24
-
25
-
26
async def delete_record(client: Redis, element_id: str, key_prefix: str) -> None:
    """Delete the Redis key made of ``key_prefix`` immediately followed by ``element_id``."""
    await client.delete(f"{key_prefix}{element_id}")
29
-
30
-
31
async def validate_upload(client: Redis, first_element: dict, key_prefix: str) -> None:
    """Assert that ``first_element`` is stored in Redis under its prefixed key.

    The record is first read with the JSON ``get`` command at path ``$``. If the
    server answers with a ``ResponseError``, the key is read with a plain ``get``
    and the payload is decoded with ``json.loads``.

    Args:
        client: Async Redis client used to open the pipeline.
        first_element: Element dict; must contain ``element_id``, ``text`` and
            ``embeddings``.
        key_prefix: Prefix that was prepended to the element id on upload.

    Raises:
        AssertionError: If no record exists at the key, or the stored id, text
            or embeddings differ from ``first_element``.
    """
    element_id = first_element["element_id"]
    key_with_prefix = f"{key_prefix}{element_id}"
    expected_text = first_element["text"]
    expected_embeddings = first_element["embeddings"]

    async with client.pipeline(transaction=True) as pipe:
        try:
            raw = (await pipe.json().get(key_with_prefix, "$").execute())[0]
            stored_as_json = True
        except redis_exceptions.ResponseError:
            # NOTE(review): presumably hit when the value was written as a plain
            # string or the JSON commands are unavailable - confirm.
            raw = (await pipe.get(key_with_prefix).execute())[0]
            stored_as_json = False

    # Fail with a readable assertion (not a TypeError) when the key is missing;
    # this has to happen before the reply is indexed or decoded.
    assert raw is not None, f"no record found in redis at key {key_with_prefix!r}"
    response = raw[0] if stored_as_json else json.loads(raw)
    assert response is not None

    assert response["element_id"] == element_id
    assert response["text"] == expected_text

    # Euclidean distance between stored and expected vectors; ~0 means equal.
    embedding_distance = np.linalg.norm(
        np.array(response["embeddings"]) - np.array(expected_embeddings)
    )
    assert embedding_distance < 1e-10
52
-
53
-
54
def _open_redis_client(
    connection_kwargs: dict, uri: Optional[str] = None, password: Optional[str] = None
) -> Redis:
    """Build an async Redis client from ``uri`` when given, else from ``connection_kwargs``."""
    if uri:
        return from_url(uri)
    return Redis(**connection_kwargs, password=password)


async def redis_destination_test(
    upload_file: Path,
    tmp_path: Path,
    connection_kwargs: dict,
    uploader_config: dict,
    uri: Optional[str] = None,
    password: Optional[str] = None,
):
    """Upload the elements in ``upload_file`` to Redis, verify one, then clean up.

    Args:
        upload_file: JSON file holding a list of element dicts.
        tmp_path: Unused here; kept so existing callers keep working.
        connection_kwargs: Keyword arguments for ``RedisConnectionConfig`` and,
            when no ``uri`` is given, for the verification ``Redis`` client.
        uploader_config: Extra keyword arguments for ``RedisUploaderConfig``.
        uri: Optional Redis URL; when set it is used for the verification and
            cleanup clients in place of ``connection_kwargs``.
        password: Optional password, used when connecting without ``uri``.
    """
    uploader = RedisUploader(
        connection_config=RedisConnectionConfig(
            **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
        ),
        upload_config=RedisUploaderConfig(batch_size=10, **uploader_config),
    )
    key_prefix = uploader.upload_config.key_prefix

    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=REDIS_CONNECTOR_TYPE,
        identifier="mock-file-data",
    )
    with upload_file.open() as upload_fp:
        elements = json.load(upload_fp)
    first_element = elements[0]

    try:
        # NOTE(review): nothing is uploaded when the uploader is not async, so
        # the validation below would then fail - confirm this is intended.
        if uploader.is_async():
            await uploader.run_data_async(data=elements, file_data=file_data)

        async with _open_redis_client(connection_kwargs, uri=uri, password=password) as client:
            await validate_upload(
                client=client,
                first_element=first_element,
                key_prefix=key_prefix,
            )
    finally:
        # Always remove the uploaded keys, whether or not validation passed.
        async with _open_redis_client(connection_kwargs, uri=uri, password=password) as client:
            await asyncio.gather(
                *(delete_record(client, element["element_id"], key_prefix) for element in elements)
            )
112
-
113
-
114
@pytest.mark.asyncio
@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path: Path):
    """Run the Redis destination check using host/port settings plus a password."""
    azure_connection = {
        "host": "utic-dashboard-dev.redis.cache.windows.net",
        "port": 6380,
        "db": 0,
        "ssl": True,
    }
    upload_settings = {"key_prefix": "test_ingest:"}
    azure_password = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
    await redis_destination_test(
        upload_file,
        tmp_path,
        azure_connection,
        upload_settings,
        password=azure_password,
    )
131
-
132
-
133
@pytest.mark.asyncio
@pytest.mark.tags(REDIS_CONNECTOR_TYPE, DESTINATION_TAG, "redis", NOSQL_TAG)
@requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
    """Run the Redis destination check with all connection details in one URI."""
    from urllib.parse import quote

    connection_kwargs = {}
    uploader_config = {
        "key_prefix": "test_ingest:",
    }
    redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
    # The env var holds the raw password, so percent-encode it: characters such
    # as "/", "@", "+" or "%" would otherwise corrupt the URL.
    encoded_pw = quote(redis_pw, safe="")
    uri = f"rediss://:{encoded_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
    await redis_destination_test(upload_file, tmp_path, connection_kwargs, uploader_config, uri=uri)
@@ -1,184 +0,0 @@
1
- import os
2
- import tempfile
3
- import uuid
4
- from pathlib import Path
5
-
6
- import pytest
7
-
8
- from test.integration.connectors.utils.constants import (
9
- BLOB_STORAGE_TAG,
10
- DESTINATION_TAG,
11
- SOURCE_TAG,
12
- env_setup_path,
13
- )
14
- from test.integration.connectors.utils.docker_compose import docker_compose_context
15
- from test.integration.connectors.utils.validation.source import (
16
- SourceValidationConfigs,
17
- source_connector_validation,
18
- )
19
- from test.integration.utils import requires_env
20
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
21
- from unstructured_ingest.errors_v2 import UserAuthError, UserError
22
- from unstructured_ingest.processes.connectors.fsspec.s3 import (
23
- CONNECTOR_TYPE,
24
- S3AccessConfig,
25
- S3ConnectionConfig,
26
- S3Downloader,
27
- S3DownloaderConfig,
28
- S3Indexer,
29
- S3IndexerConfig,
30
- S3Uploader,
31
- S3UploaderConfig,
32
- )
33
-
34
-
35
def validate_predownload_file_data(file_data: FileData):
    """Assert the file data belongs to this connector and has no local download path yet."""
    has_expected_type = file_data.connector_type == CONNECTOR_TYPE
    assert has_expected_type
    download_path = file_data.local_download_path
    assert download_path is None
38
-
39
-
40
def validate_postdownload_file_data(file_data: FileData):
    """Assert the file data belongs to this connector and now has a local download path."""
    has_expected_type = file_data.connector_type == CONNECTOR_TYPE
    assert has_expected_type
    download_path = file_data.local_download_path
    assert download_path is not None
43
-
44
-
45
@pytest.fixture
def anon_connection_config() -> S3ConnectionConfig:
    """Fixture: an S3 connection config with ``anonymous=True`` and a default access config."""
    default_access = S3AccessConfig()
    return S3ConnectionConfig(access_config=default_access, anonymous=True)
48
-
49
-
50
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
async def test_s3_source(anon_connection_config: S3ConnectionConfig):
    """Index and download the ``small-pdf-set`` prefix anonymously and validate 4 files."""
    s3_index_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/")
    with tempfile.TemporaryDirectory() as scratch_dir:
        s3_download_config = S3DownloaderConfig(download_dir=Path(scratch_dir))
        s3_indexer = S3Indexer(
            connection_config=anon_connection_config, index_config=s3_index_config
        )
        s3_downloader = S3Downloader(
            connection_config=anon_connection_config, download_config=s3_download_config
        )
        validation_configs = SourceValidationConfigs(
            test_id="s3",
            predownload_file_data_check=validate_predownload_file_data,
            postdownload_file_data_check=validate_postdownload_file_data,
            expected_num_files=4,
        )
        await source_connector_validation(
            indexer=s3_indexer,
            downloader=s3_downloader,
            configs=validation_configs,
        )
71
-
72
-
73
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig):
    """Index and download the ``special-characters`` prefix anonymously and validate 1 file."""
    s3_index_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/special-characters/")
    with tempfile.TemporaryDirectory() as scratch_dir:
        s3_download_config = S3DownloaderConfig(download_dir=Path(scratch_dir))
        s3_indexer = S3Indexer(
            connection_config=anon_connection_config, index_config=s3_index_config
        )
        s3_downloader = S3Downloader(
            connection_config=anon_connection_config, download_config=s3_download_config
        )
        validation_configs = SourceValidationConfigs(
            test_id="s3-specialchar",
            predownload_file_data_check=validate_predownload_file_data,
            postdownload_file_data_check=validate_postdownload_file_data,
            expected_num_files=1,
        )
        await source_connector_validation(
            indexer=s3_indexer,
            downloader=s3_downloader,
            configs=validation_configs,
        )
94
-
95
-
96
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
    """The indexer precheck must raise ``UserAuthError`` for this bucket with anonymous access."""
    restricted_indexer = S3Indexer(
        connection_config=anon_connection_config,
        index_config=S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/"),
    )
    with pytest.raises(UserAuthError):
        restricted_indexer.precheck()
102
-
103
-
104
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
def test_s3_source_no_bucket(anon_connection_config: S3ConnectionConfig):
    """The indexer precheck must raise ``UserError`` for the ``s3://fake-bucket`` URL."""
    missing_bucket_indexer = S3Indexer(
        connection_config=anon_connection_config,
        index_config=S3IndexerConfig(remote_url="s3://fake-bucket"),
    )
    with pytest.raises(UserError):
        missing_bucket_indexer.precheck()
110
-
111
-
112
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio", BLOB_STORAGE_TAG)
async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
    """Validate the S3 source connector against the endpoint at ``localhost:9000``.

    The endpoint is expected to come from the ``minio/source`` docker compose
    setup, which is kept running for the duration of the validation.
    """
    anon_connection_config.endpoint_url = "http://localhost:9000"
    minio_index_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
    compose_dir = env_setup_path / "minio" / "source"
    # Date-like fields are excluded from the comparison.
    ignored_fields = [
        "metadata.date_modified",
        "metadata.date_created",
        "additional_metadata.LastModified",
    ]
    with docker_compose_context(
        docker_compose_path=compose_dir
    ), tempfile.TemporaryDirectory() as scratch_dir:
        minio_download_config = S3DownloaderConfig(download_dir=Path(scratch_dir))
        minio_indexer = S3Indexer(
            connection_config=anon_connection_config, index_config=minio_index_config
        )
        minio_downloader = S3Downloader(
            connection_config=anon_connection_config, download_config=minio_download_config
        )
        await source_connector_validation(
            indexer=minio_indexer,
            downloader=minio_downloader,
            configs=SourceValidationConfigs(
                test_id="s3-minio",
                predownload_file_data_check=validate_predownload_file_data,
                postdownload_file_data_check=validate_postdownload_file_data,
                expected_num_files=1,
                exclude_fields_extend=ignored_fields,
            ),
        )
142
-
143
-
144
def get_aws_credentials() -> dict:
    """Read the S3 test credentials from the environment.

    Returns:
        A dict with keys ``aws_access_key_id`` and ``aws_secret_access_key``.

    Raises:
        AssertionError: If ``S3_INGEST_TEST_ACCESS_KEY`` or
            ``S3_INGEST_TEST_SECRET_KEY`` is unset or empty; the message names
            the missing variable.
    """
    access_key = os.getenv("S3_INGEST_TEST_ACCESS_KEY")
    assert access_key, "S3_INGEST_TEST_ACCESS_KEY must be set to a non-empty value"
    secret_key = os.getenv("S3_INGEST_TEST_SECRET_KEY")
    assert secret_key, "S3_INGEST_TEST_SECRET_KEY must be set to a non-empty value"
    return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}
150
-
151
-
152
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
async def test_s3_destination(upload_file: Path):
    """Upload one file to a fresh, uniquely named S3 prefix and expect exactly one object.

    The destination prefix is removed recursively afterwards, whatever the outcome.
    """
    creds = get_aws_credentials()
    bucket_url = "s3://utic-ingest-test-fixtures"
    target_url = f"{bucket_url}/destination/{uuid.uuid4()}"
    s3_access = S3AccessConfig(
        key=creds["aws_access_key_id"],
        secret=creds["aws_secret_access_key"],
    )
    s3_connection = S3ConnectionConfig(access_config=s3_access)
    s3_upload_config = S3UploaderConfig(remote_url=target_url)
    uploader = S3Uploader(connection_config=s3_connection, upload_config=s3_upload_config)
    filesystem = uploader.fs
    file_name = upload_file.name
    mock_file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=file_name, filename=file_name),
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    try:
        uploader.precheck()
        if not uploader.is_async():
            uploader.run(path=upload_file, file_data=mock_file_data)
        else:
            await uploader.run_async(path=upload_file, file_data=mock_file_data)
        # Entries named "_empty" are not counted as uploaded files.
        uploaded = []
        for listed in filesystem.ls(path=target_url):
            if Path(listed).name != "_empty":
                uploaded.append(Path(listed))
        assert len(uploaded) == 1
    finally:
        filesystem.rm(path=target_url, recursive=True)
@@ -1,222 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
- from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
6
- from test.integration.connectors.utils.validation.source import (
7
- SourceValidationConfigs,
8
- source_connector_validation,
9
- )
10
- from test.integration.utils import requires_env
11
- from unstructured_ingest.processes.connectors.sharepoint import (
12
- CONNECTOR_TYPE,
13
- SharepointAccessConfig,
14
- SharepointConnectionConfig,
15
- SharepointDownloader,
16
- SharepointDownloaderConfig,
17
- SharepointIndexer,
18
- SharepointIndexerConfig,
19
- )
20
-
21
-
22
def sharepoint_config():
    """Return an object holding the SharePoint test settings read from ``os.environ``.

    The object has the attributes ``client_id``, ``client_cred``, ``user_pname``
    and ``tenant``. A ``KeyError`` is raised if any of the variables is missing.
    """

    class SharepointTestConfig:
        def __init__(self):
            env_names = (
                "SHAREPOINT_CLIENT_ID",
                "SHAREPOINT_CRED",
                "MS_USER_PNAME",
                "MS_TENANT_ID",
            )
            # Read in this fixed order so the first missing variable is the one reported.
            (
                self.client_id,
                self.client_cred,
                self.user_pname,
                self.tenant,
            ) = (os.environ[env_name] for env_name in env_names)

    return SharepointTestConfig()
31
-
32
-
33
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_sharepoint_source(temp_dir):
    """Recursively index the test site without a path filter and validate 4 files."""
    settings = sharepoint_config()
    site_url = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"

    # Connection shared by the indexer and the downloader.
    connection = SharepointConnectionConfig(
        client_id=settings.client_id,
        site=site_url,
        tenant=settings.tenant,
        user_pname=settings.user_pname,
        access_config=SharepointAccessConfig(client_cred=settings.client_cred),
    )
    indexing = SharepointIndexerConfig(recursive=True)
    downloading = SharepointDownloaderConfig(download_dir=temp_dir)

    sp_indexer = SharepointIndexer(connection_config=connection, index_config=indexing)
    sp_downloader = SharepointDownloader(
        connection_config=connection, download_config=downloading
    )

    # Fields left out of the comparison.
    ignored_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    await source_connector_validation(
        indexer=sp_indexer,
        downloader=sp_downloader,
        configs=SourceValidationConfigs(
            test_id="sharepoint1",
            expected_num_files=4,
            validate_downloaded_files=True,
            exclude_fields_extend=ignored_fields,
        ),
    )
79
-
80
-
81
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_sharepoint_source_with_path(temp_dir):
    """Recursively index the test site under path ``Folder1`` and validate 2 files."""
    settings = sharepoint_config()
    site_url = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"

    # Connection shared by the indexer and the downloader.
    connection = SharepointConnectionConfig(
        client_id=settings.client_id,
        site=site_url,
        tenant=settings.tenant,
        user_pname=settings.user_pname,
        access_config=SharepointAccessConfig(client_cred=settings.client_cred),
    )
    indexing = SharepointIndexerConfig(recursive=True, path="Folder1")
    downloading = SharepointDownloaderConfig(download_dir=temp_dir)

    sp_indexer = SharepointIndexer(connection_config=connection, index_config=indexing)
    sp_downloader = SharepointDownloader(
        connection_config=connection, download_config=downloading
    )

    # Fields left out of the comparison.
    ignored_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    await source_connector_validation(
        indexer=sp_indexer,
        downloader=sp_downloader,
        configs=SourceValidationConfigs(
            test_id="sharepoint2",
            expected_num_files=2,
            validate_downloaded_files=True,
            exclude_fields_extend=ignored_fields,
        ),
    )
127
-
128
-
129
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_sharepoint_root_with_path(temp_dir):
    """Recursively index the root site under path ``e2e-test-folder`` and validate 1 file."""
    settings = sharepoint_config()
    site_url = "https://unstructuredio.sharepoint.com/"

    # Connection shared by the indexer and the downloader.
    connection = SharepointConnectionConfig(
        client_id=settings.client_id,
        site=site_url,
        tenant=settings.tenant,
        user_pname=settings.user_pname,
        access_config=SharepointAccessConfig(client_cred=settings.client_cred),
    )
    indexing = SharepointIndexerConfig(recursive=True, path="e2e-test-folder")
    downloading = SharepointDownloaderConfig(download_dir=temp_dir)

    sp_indexer = SharepointIndexer(connection_config=connection, index_config=indexing)
    sp_downloader = SharepointDownloader(
        connection_config=connection, download_config=downloading
    )

    # Fields left out of the comparison.
    ignored_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    await source_connector_validation(
        indexer=sp_indexer,
        downloader=sp_downloader,
        configs=SourceValidationConfigs(
            test_id="sharepoint3",
            expected_num_files=1,
            validate_downloaded_files=True,
            exclude_fields_extend=ignored_fields,
        ),
    )
175
-
176
-
177
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_sharepoint_shared_documents(temp_dir):
    """Recursively index the test site under path ``Shared Documents`` and validate 4 files."""
    settings = sharepoint_config()
    site_url = "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source"

    # Connection shared by the indexer and the downloader.
    connection = SharepointConnectionConfig(
        client_id=settings.client_id,
        site=site_url,
        tenant=settings.tenant,
        user_pname=settings.user_pname,
        access_config=SharepointAccessConfig(client_cred=settings.client_cred),
    )
    indexing = SharepointIndexerConfig(recursive=True, path="Shared Documents")
    downloading = SharepointDownloaderConfig(download_dir=temp_dir)

    sp_indexer = SharepointIndexer(connection_config=connection, index_config=indexing)
    sp_downloader = SharepointDownloader(
        connection_config=connection, download_config=downloading
    )

    # Fields left out of the comparison.
    ignored_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    await source_connector_validation(
        indexer=sp_indexer,
        downloader=sp_downloader,
        configs=SourceValidationConfigs(
            test_id="sharepoint4",
            expected_num_files=4,
            validate_downloaded_files=True,
            exclude_fields_extend=ignored_fields,
        ),
    )