unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (192)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,111 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
- from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
6
- from test.integration.connectors.utils.validation.source import (
7
- SourceValidationConfigs,
8
- source_connector_validation,
9
- )
10
- from test.integration.utils import requires_env
11
- from unstructured_ingest.processes.connectors.confluence import (
12
- CONNECTOR_TYPE,
13
- ConfluenceAccessConfig,
14
- ConfluenceConnectionConfig,
15
- ConfluenceDownloader,
16
- ConfluenceDownloaderConfig,
17
- ConfluenceIndexer,
18
- ConfluenceIndexerConfig,
19
- )
20
-
21
-
22
- @pytest.mark.asyncio
23
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
24
- @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
25
- async def test_confluence_source(temp_dir):
26
- # Retrieve environment variables
27
- confluence_url = "https://unstructured-ingest-test.atlassian.net"
28
- user_email = os.environ["CONFLUENCE_USER_EMAIL"]
29
- api_token = os.environ["CONFLUENCE_API_TOKEN"]
30
- spaces = ["testteamsp", "MFS"]
31
-
32
- # Create connection and indexer configurations
33
- access_config = ConfluenceAccessConfig(api_token=api_token)
34
- connection_config = ConfluenceConnectionConfig(
35
- url=confluence_url,
36
- username=user_email,
37
- access_config=access_config,
38
- )
39
- index_config = ConfluenceIndexerConfig(
40
- max_num_of_spaces=500,
41
- max_num_of_docs_from_each_space=100,
42
- spaces=spaces,
43
- )
44
-
45
- download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
46
-
47
- # Instantiate indexer and downloader
48
- indexer = ConfluenceIndexer(
49
- connection_config=connection_config,
50
- index_config=index_config,
51
- )
52
- downloader = ConfluenceDownloader(
53
- connection_config=connection_config,
54
- download_config=download_config,
55
- )
56
-
57
- # Run the source connector validation
58
- await source_connector_validation(
59
- indexer=indexer,
60
- downloader=downloader,
61
- configs=SourceValidationConfigs(
62
- test_id="confluence",
63
- expected_num_files=11,
64
- validate_downloaded_files=True,
65
- ),
66
- )
67
-
68
-
69
- @pytest.mark.asyncio
70
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
71
- @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
72
- async def test_confluence_source_large(temp_dir):
73
- # Retrieve environment variables
74
- confluence_url = "https://unstructured-ingest-test.atlassian.net"
75
- user_email = os.environ["CONFLUENCE_USER_EMAIL"]
76
- api_token = os.environ["CONFLUENCE_API_TOKEN"]
77
- spaces = ["testteamsp1"]
78
-
79
- # Create connection and indexer configurations
80
- access_config = ConfluenceAccessConfig(api_token=api_token)
81
- connection_config = ConfluenceConnectionConfig(
82
- url=confluence_url,
83
- username=user_email,
84
- access_config=access_config,
85
- )
86
- index_config = ConfluenceIndexerConfig(
87
- max_num_of_spaces=10,
88
- max_num_of_docs_from_each_space=250,
89
- spaces=spaces,
90
- )
91
-
92
- download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
93
-
94
- # Instantiate indexer and downloader
95
- indexer = ConfluenceIndexer(
96
- connection_config=connection_config,
97
- index_config=index_config,
98
- )
99
- downloader = ConfluenceDownloader(
100
- connection_config=connection_config,
101
- download_config=download_config,
102
- )
103
-
104
- # Run the source connector validation
105
- await source_connector_validation(
106
- indexer=indexer,
107
- downloader=downloader,
108
- configs=SourceValidationConfigs(
109
- test_id="confluence_large", expected_num_files=250, validate_file_data=False
110
- ),
111
- )
@@ -1,183 +0,0 @@
1
- import multiprocessing
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
- from deltalake import DeltaTable
7
- from fsspec import get_filesystem_class
8
-
9
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
10
- from test.integration.utils import requires_env
11
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
12
- from unstructured_ingest.processes.connectors.delta_table import (
13
- CONNECTOR_TYPE,
14
- DeltaTableAccessConfig,
15
- DeltaTableConnectionConfig,
16
- DeltaTableUploader,
17
- DeltaTableUploaderConfig,
18
- DeltaTableUploadStager,
19
- DeltaTableUploadStagerConfig,
20
- )
21
-
22
- multiprocessing.set_start_method("spawn")
23
-
24
-
25
- @pytest.mark.asyncio
26
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
27
- async def test_delta_table_destination_local(upload_file: Path, temp_dir: Path):
28
- destination_path = str(temp_dir)
29
- connection_config = DeltaTableConnectionConfig(
30
- access_config=DeltaTableAccessConfig(),
31
- table_uri=destination_path,
32
- )
33
- stager_config = DeltaTableUploadStagerConfig()
34
- stager = DeltaTableUploadStager(upload_stager_config=stager_config)
35
- new_upload_file = stager.run(
36
- elements_filepath=upload_file,
37
- output_dir=temp_dir,
38
- output_filename=upload_file.name,
39
- )
40
-
41
- upload_config = DeltaTableUploaderConfig()
42
- uploader = DeltaTableUploader(connection_config=connection_config, upload_config=upload_config)
43
- file_data = FileData(
44
- source_identifiers=SourceIdentifiers(
45
- fullpath=upload_file.name, filename=new_upload_file.name
46
- ),
47
- connector_type=CONNECTOR_TYPE,
48
- identifier="mock file data",
49
- )
50
-
51
- if uploader.is_async():
52
- await uploader.run_async(path=new_upload_file, file_data=file_data)
53
- else:
54
- uploader.run(path=new_upload_file, file_data=file_data)
55
- delta_table_path = os.path.join(destination_path, upload_file.name)
56
- delta_table = DeltaTable(table_uri=delta_table_path)
57
- df = delta_table.to_pandas()
58
-
59
- EXPECTED_COLUMNS = 10
60
- EXPECTED_ROWS = 22
61
- assert (
62
- len(df) == EXPECTED_ROWS
63
- ), f"Number of rows in table vs expected: {len(df)}/{EXPECTED_ROWS}"
64
- assert (
65
- len(df.columns) == EXPECTED_COLUMNS
66
- ), f"Number of columns in table vs expected: {len(df.columns)}/{EXPECTED_COLUMNS}"
67
-
68
-
69
- def get_aws_credentials() -> dict:
70
- access_key = os.getenv("S3_INGEST_TEST_ACCESS_KEY", None)
71
- assert access_key
72
- secret_key = os.getenv("S3_INGEST_TEST_SECRET_KEY", None)
73
- assert secret_key
74
- return {
75
- "AWS_ACCESS_KEY_ID": access_key,
76
- "AWS_SECRET_ACCESS_KEY": secret_key,
77
- "AWS_REGION": "us-east-2",
78
- }
79
-
80
-
81
- @pytest.mark.asyncio
82
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
83
- @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
84
- async def test_delta_table_destination_s3(upload_file: Path, temp_dir: Path):
85
- aws_credentials = get_aws_credentials()
86
- s3_bucket = "s3://utic-platform-test-destination"
87
- destination_path = f"{s3_bucket}/destination/test"
88
- connection_config = DeltaTableConnectionConfig(
89
- access_config=DeltaTableAccessConfig(
90
- aws_access_key_id=aws_credentials["AWS_ACCESS_KEY_ID"],
91
- aws_secret_access_key=aws_credentials["AWS_SECRET_ACCESS_KEY"],
92
- ),
93
- aws_region=aws_credentials["AWS_REGION"],
94
- table_uri=destination_path,
95
- )
96
- stager_config = DeltaTableUploadStagerConfig()
97
- stager = DeltaTableUploadStager(upload_stager_config=stager_config)
98
- new_upload_file = stager.run(
99
- elements_filepath=upload_file,
100
- output_dir=temp_dir,
101
- output_filename=upload_file.name,
102
- )
103
-
104
- upload_config = DeltaTableUploaderConfig()
105
- uploader = DeltaTableUploader(connection_config=connection_config, upload_config=upload_config)
106
- file_data = FileData(
107
- source_identifiers=SourceIdentifiers(
108
- fullpath=upload_file.name, filename=new_upload_file.name
109
- ),
110
- connector_type=CONNECTOR_TYPE,
111
- identifier="mock file data",
112
- )
113
-
114
- try:
115
- uploader.precheck()
116
- if uploader.is_async():
117
- await uploader.run_async(path=new_upload_file, file_data=file_data)
118
- else:
119
- uploader.run(path=new_upload_file, file_data=file_data)
120
- delta_table_path = os.path.join(destination_path, upload_file.name)
121
- delta_table = DeltaTable(table_uri=delta_table_path, storage_options=aws_credentials)
122
- df = delta_table.to_pandas()
123
-
124
- EXPECTED_COLUMNS = 10
125
- EXPECTED_ROWS = 22
126
- assert (
127
- len(df) == EXPECTED_ROWS
128
- ), f"Number of rows in table vs expected: {len(df)}/{EXPECTED_ROWS}"
129
- assert (
130
- len(df.columns) == EXPECTED_COLUMNS
131
- ), f"Number of columns in table vs expected: {len(df.columns)}/{EXPECTED_COLUMNS}"
132
- finally:
133
- s3fs = get_filesystem_class("s3")(
134
- key=aws_credentials["AWS_ACCESS_KEY_ID"],
135
- secret=aws_credentials["AWS_SECRET_ACCESS_KEY"],
136
- )
137
- s3fs.rm(path=destination_path, recursive=True)
138
-
139
-
140
- @pytest.mark.asyncio
141
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
142
- @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
143
- async def test_delta_table_destination_s3_bad_creds(upload_file: Path, temp_dir: Path):
144
- aws_credentials = {
145
- "AWS_ACCESS_KEY_ID": "bad key",
146
- "AWS_SECRET_ACCESS_KEY": "bad secret",
147
- "AWS_REGION": "us-east-2",
148
- }
149
- s3_bucket = "s3://utic-platform-test-destination"
150
- destination_path = f"{s3_bucket}/destination/test"
151
- connection_config = DeltaTableConnectionConfig(
152
- access_config=DeltaTableAccessConfig(
153
- aws_access_key_id=aws_credentials["AWS_ACCESS_KEY_ID"],
154
- aws_secret_access_key=aws_credentials["AWS_SECRET_ACCESS_KEY"],
155
- ),
156
- aws_region=aws_credentials["AWS_REGION"],
157
- table_uri=destination_path,
158
- )
159
- stager_config = DeltaTableUploadStagerConfig()
160
- stager = DeltaTableUploadStager(upload_stager_config=stager_config)
161
- new_upload_file = stager.run(
162
- elements_filepath=upload_file,
163
- output_dir=temp_dir,
164
- output_filename=upload_file.name,
165
- )
166
-
167
- upload_config = DeltaTableUploaderConfig()
168
- uploader = DeltaTableUploader(connection_config=connection_config, upload_config=upload_config)
169
- file_data = FileData(
170
- source_identifiers=SourceIdentifiers(
171
- fullpath=upload_file.name, filename=new_upload_file.name
172
- ),
173
- connector_type=CONNECTOR_TYPE,
174
- identifier="mock file data",
175
- )
176
-
177
- with pytest.raises(Exception) as excinfo:
178
- if uploader.is_async():
179
- await uploader.run_async(path=new_upload_file, file_data=file_data)
180
- else:
181
- uploader.run(path=new_upload_file, file_data=file_data)
182
-
183
- assert "403 Forbidden" in str(excinfo.value), f"Exception message did not match: {str(excinfo)}"
@@ -1,151 +0,0 @@
1
- import os
2
-
3
- import pytest
4
- import requests
5
-
6
- from test.integration.connectors.utils.constants import (
7
- BLOB_STORAGE_TAG,
8
- SOURCE_TAG,
9
- )
10
- from test.integration.connectors.utils.validation.source import (
11
- SourceValidationConfigs,
12
- source_connector_validation,
13
- )
14
- from test.integration.utils import requires_env
15
- from unstructured_ingest.processes.connectors.fsspec.dropbox import (
16
- CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE,
17
- )
18
- from unstructured_ingest.processes.connectors.fsspec.dropbox import (
19
- DropboxAccessConfig,
20
- DropboxConnectionConfig,
21
- DropboxDownloader,
22
- DropboxDownloaderConfig,
23
- DropboxIndexer,
24
- DropboxIndexerConfig,
25
- )
26
-
27
-
28
- @pytest.mark.asyncio
29
- @pytest.mark.tags(DROPBOX_CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
30
- @requires_env("DROPBOX_REFRESH_TOKEN", "DROPBOX_APP_KEY", "DROPBOX_APP_SECRET")
31
- async def test_dropbox_source(temp_dir):
32
- """
33
- Integration test for the Dropbox source connector.
34
-
35
- This test indexes data from dropbox://test-input/ and downloads the resulting files,
36
- then compares them to fixture data.
37
- """
38
- refresh_token = os.getenv("DROPBOX_REFRESH_TOKEN")
39
- app_key = os.getenv("DROPBOX_APP_KEY")
40
- app_secret = os.getenv("DROPBOX_APP_SECRET")
41
-
42
- connection_config = DropboxConnectionConfig(
43
- access_config=DropboxAccessConfig(
44
- refresh_token=refresh_token,
45
- app_key=app_key,
46
- app_secret=app_secret,
47
- )
48
- )
49
-
50
- index_config = DropboxIndexerConfig(
51
- recursive=True,
52
- remote_url="dropbox://test-input",
53
- )
54
- downloader_config = DropboxDownloaderConfig(download_dir=temp_dir)
55
-
56
- indexer = DropboxIndexer(
57
- connection_config=connection_config,
58
- index_config=index_config,
59
- )
60
- downloader = DropboxDownloader(
61
- connection_config=connection_config,
62
- download_config=downloader_config,
63
- )
64
-
65
- await source_connector_validation(
66
- indexer=indexer,
67
- downloader=downloader,
68
- configs=SourceValidationConfigs(
69
- test_id="dropbox",
70
- expected_num_files=4,
71
- validate_downloaded_files=True,
72
- exclude_fields_extend=[
73
- "metadata.date_created",
74
- "metadata.date_modified",
75
- ],
76
- ),
77
- )
78
-
79
-
80
- @pytest.mark.asyncio
81
- @pytest.mark.tags(DROPBOX_CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
82
- @requires_env("DROPBOX_REFRESH_TOKEN", "DROPBOX_APP_KEY", "DROPBOX_APP_SECRET")
83
- async def test_dropbox_short_lived_token_via_refresh(temp_dir):
84
- """
85
- Demonstrates manually generating an access token from refresh credentials,
86
- then passing ONLY the short-lived token to the Dropbox connector
87
- (no app_key, app_secret, or refresh_token in the actual connection config).
88
-
89
- This effectively mimics an external system that hands us a short-lived token.
90
- """
91
- refresh_token = os.getenv("DROPBOX_REFRESH_TOKEN")
92
- app_key = os.getenv("DROPBOX_APP_KEY")
93
- app_secret = os.getenv("DROPBOX_APP_SECRET")
94
-
95
- # Manually request a short-lived token from Dropbox's OAuth endpoint
96
- # This call is basically what the connector code does internally,
97
- # but we're doing it here in the test so we can pass only the short-lived token later.
98
- response = requests.post(
99
- "https://api.dropboxapi.com/oauth2/token",
100
- data={
101
- "grant_type": "refresh_token",
102
- "refresh_token": refresh_token,
103
- },
104
- auth=(app_key, app_secret),
105
- timeout=30, # seconds
106
- )
107
- response.raise_for_status()
108
- data = response.json()
109
- short_lived_token = data["access_token"]
110
- print("Acquired an access token from Dropbox")
111
-
112
- # Build connection config with ONLY the short-lived token
113
- # We omit refresh_token, app_key, and app_secret to confirm that
114
- # our connector can operate purely on the short-lived token.
115
- connection_config = DropboxConnectionConfig(
116
- access_config=DropboxAccessConfig(
117
- token=short_lived_token,
118
- app_key=None,
119
- app_secret=None,
120
- refresh_token=None,
121
- )
122
- )
123
-
124
- index_config = DropboxIndexerConfig(
125
- recursive=True,
126
- remote_url="dropbox://test-input",
127
- )
128
- downloader_config = DropboxDownloaderConfig(download_dir=temp_dir)
129
-
130
- indexer = DropboxIndexer(
131
- connection_config=connection_config,
132
- index_config=index_config,
133
- )
134
- downloader = DropboxDownloader(
135
- connection_config=connection_config,
136
- download_config=downloader_config,
137
- )
138
-
139
- await source_connector_validation(
140
- indexer=indexer,
141
- downloader=downloader,
142
- configs=SourceValidationConfigs(
143
- test_id="dropbox_short_lived_via_refresh",
144
- expected_num_files=4,
145
- validate_downloaded_files=True,
146
- exclude_fields_extend=[
147
- "metadata.date_created",
148
- "metadata.date_modified",
149
- ],
150
- ),
151
- )
@@ -1,49 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
- from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
6
- from test.integration.connectors.utils.validation.source import (
7
- SourceValidationConfigs,
8
- source_connector_validation,
9
- )
10
- from test.integration.utils import requires_env
11
- from unstructured_ingest.processes.connectors.github import (
12
- CONNECTOR_TYPE,
13
- GithubAccessConfig,
14
- GithubConnectionConfig,
15
- GithubDownloader,
16
- GithubDownloaderConfig,
17
- GithubIndexer,
18
- GithubIndexerConfig,
19
- )
20
-
21
-
22
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
23
- @pytest.mark.asyncio
24
- @requires_env("GH_READ_ONLY_ACCESS_TOKEN")
25
- async def test_github_source(temp_dir):
26
- access_token = os.environ["GH_READ_ONLY_ACCESS_TOKEN"]
27
- connection_config = GithubConnectionConfig(
28
- access_config=GithubAccessConfig(access_token=access_token),
29
- url="dcneiner/Downloadify",
30
- )
31
-
32
- indexer = GithubIndexer(
33
- connection_config=connection_config,
34
- index_config=GithubIndexerConfig(file_glob=["*.txt", "*.html"]),
35
- )
36
-
37
- downloader = GithubDownloader(
38
- connection_config=connection_config,
39
- download_config=GithubDownloaderConfig(download_dir=temp_dir),
40
- )
41
-
42
- # Run the source connector validation
43
- await source_connector_validation(
44
- indexer=indexer,
45
- downloader=downloader,
46
- configs=SourceValidationConfigs(
47
- test_id="github", expected_num_files=2, validate_downloaded_files=True
48
- ),
49
- )