unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (78)
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_confluence.py +113 -0
  4. test/integration/connectors/test_kafka.py +167 -0
  5. test/integration/connectors/test_onedrive.py +112 -0
  6. test/integration/connectors/test_pinecone.py +161 -0
  7. test/integration/connectors/test_qdrant.py +137 -0
  8. test/integration/connectors/test_s3.py +23 -0
  9. test/integration/connectors/utils/docker.py +2 -1
  10. test/integration/connectors/utils/validation.py +73 -22
  11. test/unit/v2/__init__.py +0 -0
  12. test/unit/v2/chunkers/__init__.py +0 -0
  13. test/unit/v2/chunkers/test_chunkers.py +49 -0
  14. test/unit/v2/connectors/__init__.py +0 -0
  15. test/unit/v2/embedders/__init__.py +0 -0
  16. test/unit/v2/embedders/test_bedrock.py +36 -0
  17. test/unit/v2/embedders/test_huggingface.py +48 -0
  18. test/unit/v2/embedders/test_mixedbread.py +37 -0
  19. test/unit/v2/embedders/test_octoai.py +35 -0
  20. test/unit/v2/embedders/test_openai.py +35 -0
  21. test/unit/v2/embedders/test_togetherai.py +37 -0
  22. test/unit/v2/embedders/test_vertexai.py +37 -0
  23. test/unit/v2/embedders/test_voyageai.py +38 -0
  24. test/unit/v2/partitioners/__init__.py +0 -0
  25. test/unit/v2/partitioners/test_partitioner.py +63 -0
  26. test/unit/v2/utils/__init__.py +0 -0
  27. test/unit/v2/utils/data_generator.py +32 -0
  28. unstructured_ingest/__version__.py +1 -1
  29. unstructured_ingest/cli/cmds/__init__.py +2 -2
  30. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  31. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  32. unstructured_ingest/connector/kafka.py +0 -1
  33. unstructured_ingest/interfaces.py +7 -7
  34. unstructured_ingest/runner/writers/__init__.py +2 -2
  35. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  36. unstructured_ingest/v2/constants.py +2 -0
  37. unstructured_ingest/v2/processes/chunker.py +2 -2
  38. unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
  39. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  40. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  41. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  42. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  43. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  44. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  45. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  46. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  47. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
  48. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  49. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  50. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  51. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
  52. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
  53. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  54. unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
  55. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  56. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  57. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  58. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  59. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  60. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  61. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  62. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  63. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  64. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  65. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  66. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  67. unstructured_ingest/v2/processes/partitioner.py +14 -3
  68. unstructured_ingest/v2/unstructured_api.py +24 -10
  69. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
  70. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
  71. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  72. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  73. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  74. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  75. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  76. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  77. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  78. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
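
The headline change in this release is the rename of the Azure destination from azure_cognitive_search to azure_ai_search across the CLI, the v1 connector, the runner writers, and the v2 connectors (items 30, 31, 35, 41, and 71 above). As a minimal migration sketch for v2 users: the 0.3.0 import path and class names below are taken verbatim from the new integration test further down; the 0.2.x class names are not visible in this diff, so only the old module path is shown.

# 0.2.x module path (from the rename above); its class names are not shown
# in this diff, so none are guessed here:
# from unstructured_ingest.v2.processes.connectors.azure_cognitive_search import ...

# 0.3.0 path and class names, as used by the new integration test:
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    AzureAISearchAccessConfig,
    AzureAISearchConnectionConfig,
    AzureAISearchUploader,
    AzureAISearchUploaderConfig,
    AzureAISearchUploadStager,
    AzureAISearchUploadStagerConfig,
)
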
test/integration/connectors/test_astradb.py
@@ -0,0 +1,109 @@
+ import json
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+ from uuid import uuid4
+
+ import pytest
+ from astrapy import Collection
+ from astrapy import DataAPIClient as AstraDBClient
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.astradb import (
+     CONNECTOR_TYPE,
+     AstraDBAccessConfig,
+     AstraDBConnectionConfig,
+     AstraDBUploader,
+     AstraDBUploaderConfig,
+     AstraDBUploadStager,
+ )
+
+
+ @dataclass(frozen=True)
+ class EnvData:
+     api_endpoint: str
+     token: str
+
+
+ def get_env_data() -> EnvData:
+     return EnvData(
+         api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
+         token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+     )
+
+
+ @pytest.fixture
+ def collection(upload_file: Path) -> Collection:
+     random_id = str(uuid4())[:8]
+     collection_name = f"utic_test_{random_id}"
+     with upload_file.open("r") as upload_fp:
+         upload_data = json.load(upload_fp)
+     first_content = upload_data[0]
+     embeddings = first_content["embeddings"]
+     embedding_dimension = len(embeddings)
+     my_client = AstraDBClient()
+     env_data = get_env_data()
+     astra_db = my_client.get_database(
+         api_endpoint=env_data.api_endpoint,
+         token=env_data.token,
+     )
+     collection = astra_db.create_collection(collection_name, dimension=embedding_dimension)
+     try:
+         yield collection
+     finally:
+         astra_db.drop_collection(collection)
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+ async def test_astradb_destination(
+     upload_file: Path,
+     collection: Collection,
+     tmp_path: Path,
+ ):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     stager = AstraDBUploadStager()
+     env_data = get_env_data()
+     uploader = AstraDBUploader(
+         connection_config=AstraDBConnectionConfig(
+             access_config=AstraDBAccessConfig(
+                 api_endpoint=env_data.api_endpoint, token=env_data.token
+             ),
+         ),
+         upload_config=AstraDBUploaderConfig(collection_name=collection.name),
+     )
+     staged_filepath = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+     uploader.precheck()
+     uploader.run(path=staged_filepath, file_data=file_data)
+
+     # Run validation
+     with staged_filepath.open() as f:
+         staged_elements = json.load(f)
+     expected_count = len(staged_elements)
+     current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
+     assert current_count == expected_count, (
+         f"Expected count ({expected_count}) doesn't match how "
+         f"many came back from collection: {current_count}"
+     )
+
+     # Rerun and make sure the same documents get updated
+     uploader.run(path=staged_filepath, file_data=file_data)
+     current_count = collection.count_documents(filter={}, upper_bound=expected_count * 2)
+     assert current_count == expected_count, (
+         f"Expected count ({expected_count}) doesn't match how "
+         f"many came back from collection: {current_count}"
+     )
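
Distilled from the test above, a minimal sketch of the v2 AstraDB destination flow outside pytest. All class names appear in the test; the elements file and collection name are placeholders.

import os
from pathlib import Path

from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
from unstructured_ingest.v2.processes.connectors.astradb import (
    CONNECTOR_TYPE,
    AstraDBAccessConfig,
    AstraDBConnectionConfig,
    AstraDBUploader,
    AstraDBUploaderConfig,
    AstraDBUploadStager,
)

elements_file = Path("elements.json")  # placeholder: partitioned elements with embeddings
file_data = FileData(
    source_identifiers=SourceIdentifiers(fullpath=elements_file.name, filename=elements_file.name),
    connector_type=CONNECTOR_TYPE,
    identifier="example-doc",
)
uploader = AstraDBUploader(
    connection_config=AstraDBConnectionConfig(
        access_config=AstraDBAccessConfig(
            api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
            token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        ),
    ),
    # Placeholder collection; the test creates one sized to the embedding dimension.
    upload_config=AstraDBUploaderConfig(collection_name="my_collection"),
)
# Stage the raw elements into the uploader's expected shape, then upload.
staged = AstraDBUploadStager().run(
    elements_filepath=elements_file,
    file_data=file_data,
    output_dir=elements_file.parent,
    output_filename=elements_file.name,
)
uploader.precheck()
uploader.run(path=staged, file_data=file_data)

The rerun assertion in the test implies uploads are idempotent per record, so running this twice should not duplicate documents.
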
test/integration/connectors/test_azure_cog_search.py
@@ -0,0 +1,233 @@
+ import json
+ import os
+ import time
+ from pathlib import Path
+ from uuid import uuid4
+
+ import pytest
+ from azure.core.credentials import AzureKeyCredential
+ from azure.search.documents import SearchClient
+ from azure.search.documents.indexes import SearchIndexClient
+ from azure.search.documents.indexes.models import (
+     ComplexField,
+     CorsOptions,
+     HnswAlgorithmConfiguration,
+     HnswParameters,
+     SearchField,
+     SearchFieldDataType,
+     SearchIndex,
+     SimpleField,
+     VectorSearch,
+     VectorSearchAlgorithmMetric,
+     VectorSearchProfile,
+ )
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
+     CONNECTOR_TYPE,
+     RECORD_ID_LABEL,
+     AzureAISearchAccessConfig,
+     AzureAISearchConnectionConfig,
+     AzureAISearchUploader,
+     AzureAISearchUploaderConfig,
+     AzureAISearchUploadStager,
+     AzureAISearchUploadStagerConfig,
+ )
+
+ repo_path = Path(__file__).parent.resolve()
+
+ API_KEY = "AZURE_SEARCH_API_KEY"
+ ENDPOINT = "https://ingest-test-azure-cognitive-search.search.windows.net"
+
+
+ def get_api_key() -> str:
+     key = os.environ[API_KEY]
+     return key
+
+
+ def get_fields() -> list:
+     data_source_fields = [
+         SimpleField(name="url", type=SearchFieldDataType.String),
+         SimpleField(name="version", type=SearchFieldDataType.String),
+         SimpleField(name="date_created", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="date_modified", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="date_processed", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="permissions_data", type=SearchFieldDataType.String),
+         SimpleField(name="record_locator", type=SearchFieldDataType.String),
+     ]
+     coordinates_fields = [
+         SimpleField(name="system", type=SearchFieldDataType.String),
+         SimpleField(name="layout_width", type=SearchFieldDataType.Double),
+         SimpleField(name="layout_height", type=SearchFieldDataType.Double),
+         SimpleField(name="points", type=SearchFieldDataType.String),
+     ]
+     metadata_fields = [
+         SimpleField(name="orig_elements", type=SearchFieldDataType.String),
+         SimpleField(name="category_depth", type=SearchFieldDataType.Int32),
+         SimpleField(name="parent_id", type=SearchFieldDataType.String),
+         SimpleField(name="attached_to_filename", type=SearchFieldDataType.String),
+         SimpleField(name="filetype", type=SearchFieldDataType.String),
+         SimpleField(name="last_modified", type=SearchFieldDataType.DateTimeOffset),
+         SimpleField(name="is_continuation", type=SearchFieldDataType.Boolean),
+         SimpleField(name="file_directory", type=SearchFieldDataType.String),
+         SimpleField(name="filename", type=SearchFieldDataType.String),
+         ComplexField(name="data_source", fields=data_source_fields),
+         ComplexField(name="coordinates", fields=coordinates_fields),
+         SimpleField(
+             name="languages", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(name="page_number", type=SearchFieldDataType.String),
+         SimpleField(name="links", type=SearchFieldDataType.Collection(SearchFieldDataType.String)),
+         SimpleField(name="page_name", type=SearchFieldDataType.String),
+         SimpleField(name="url", type=SearchFieldDataType.String),
+         SimpleField(
+             name="link_urls", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(
+             name="link_texts", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(
+             name="sent_from", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(
+             name="sent_to", type=SearchFieldDataType.Collection(SearchFieldDataType.String)
+         ),
+         SimpleField(name="subject", type=SearchFieldDataType.String),
+         SimpleField(name="section", type=SearchFieldDataType.String),
+         SimpleField(name="header_footer_type", type=SearchFieldDataType.String),
+         SimpleField(
+             name="emphasized_text_contents",
+             type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+         ),
+         SimpleField(
+             name="emphasized_text_tags",
+             type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+         ),
+         SimpleField(name="text_as_html", type=SearchFieldDataType.String),
+         SimpleField(name="regex_metadata", type=SearchFieldDataType.String),
+         SimpleField(name="detection_class_prob", type=SearchFieldDataType.Double),
+     ]
+     fields = [
+         SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+         SimpleField(name=RECORD_ID_LABEL, type=SearchFieldDataType.String, filterable=True),
+         SimpleField(name="element_id", type=SearchFieldDataType.String),
+         SimpleField(name="text", type=SearchFieldDataType.String),
+         SimpleField(name="type", type=SearchFieldDataType.String),
+         ComplexField(name="metadata", fields=metadata_fields),
+         SearchField(
+             name="embeddings",
+             type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+             vector_search_dimensions=384,
+             vector_search_profile_name="embeddings-config-profile",
+         ),
+     ]
+     return fields
+
+
+ def get_vector_search() -> VectorSearch:
+     return VectorSearch(
+         algorithms=[
+             HnswAlgorithmConfiguration(
+                 name="hnsw-config",
+                 parameters=HnswParameters(
+                     metric=VectorSearchAlgorithmMetric.COSINE,
+                 ),
+             )
+         ],
+         profiles=[
+             VectorSearchProfile(
+                 name="embeddings-config-profile", algorithm_configuration_name="hnsw-config"
+             )
+         ],
+     )
+
+
+ def get_search_index_client() -> SearchIndexClient:
+     api_key = get_api_key()
+     return SearchIndexClient(ENDPOINT, AzureKeyCredential(api_key))
+
+
+ @pytest.fixture
+ def index() -> str:
+     random_id = str(uuid4())[:8]
+     index_name = f"utic-test-{random_id}"
+     client = get_search_index_client()
+     index = SearchIndex(
+         name=index_name,
+         fields=get_fields(),
+         vector_search=get_vector_search(),
+         cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
+     )
+     print(f"creating index: {index_name}")
+     client.create_index(index=index)
+     try:
+         yield index_name
+     finally:
+         print(f"deleting index: {index_name}")
+         client.delete_index(index)
+
+
+ def validate_count(
+     search_client: SearchClient, expected_count: int, retries: int = 10, interval: int = 1
+ ) -> None:
+     index_count = search_client.get_document_count()
+     if index_count == expected_count:
+         return
+     tries = 0
+     while tries < retries:
+         tries += 1
+         time.sleep(interval)
+         index_count = search_client.get_document_count()
+         if index_count == expected_count:
+             break
+     assert index_count == expected_count, (
+         f"Expected count ({expected_count}) doesn't match how "
+         f"many came back from index: {index_count}"
+     )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("AZURE_SEARCH_API_KEY")
+ async def test_azure_ai_search_destination(
+     upload_file: Path,
+     index: str,
+     tmp_path: Path,
+ ):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     stager = AzureAISearchUploadStager(upload_stager_config=AzureAISearchUploadStagerConfig())
+
+     uploader = AzureAISearchUploader(
+         connection_config=AzureAISearchConnectionConfig(
+             access_config=AzureAISearchAccessConfig(key=get_api_key()),
+             endpoint=ENDPOINT,
+             index=index,
+         ),
+         upload_config=AzureAISearchUploaderConfig(),
+     )
+     staged_filepath = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+     uploader.precheck()
+     uploader.run(path=staged_filepath, file_data=file_data)
+
+     # Run validation
+     with staged_filepath.open() as f:
+         staged_elements = json.load(f)
+     expected_count = len(staged_elements)
+     search_client: SearchClient = uploader.connection_config.get_search_client()
+     validate_count(search_client=search_client, expected_count=expected_count)
+
+     # Rerun and make sure the same documents get updated
+     uploader.run(path=staged_filepath, file_data=file_data)
+     validate_count(search_client=search_client, expected_count=expected_count)
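
The test above only asserts document counts. For a content spot-check, a hedged sketch using the standard azure-search-documents client directly; the field names come from the index schema in the test, and the index name is a placeholder.

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Placeholder index name; reuse ENDPOINT and get_api_key() from the test above.
client = SearchClient(
    endpoint=ENDPOINT,
    index_name="utic-test-example",
    credential=AzureKeyCredential(get_api_key()),
)
# search_text="*" matches all documents; peek at a few uploaded records.
for doc in client.search(search_text="*", top=3):
    print(doc["id"], doc.get("type"), (doc.get("text") or "")[:80])
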
test/integration/connectors/test_confluence.py
@@ -0,0 +1,113 @@
+ import os
+
+ import pytest
+
+ from test.integration.connectors.utils.constants import (
+     SOURCE_TAG,
+ )
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.processes.connectors.confluence import (
+     CONNECTOR_TYPE,
+     ConfluenceAccessConfig,
+     ConfluenceConnectionConfig,
+     ConfluenceDownloader,
+     ConfluenceDownloaderConfig,
+     ConfluenceIndexer,
+     ConfluenceIndexerConfig,
+ )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+ async def test_confluence_source(temp_dir):
+     # Retrieve environment variables
+     confluence_url = "https://unstructured-ingest-test.atlassian.net"
+     user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+     api_token = os.environ["CONFLUENCE_API_TOKEN"]
+     spaces = ["testteamsp", "MFS"]
+
+     # Create connection and indexer configurations
+     access_config = ConfluenceAccessConfig(api_token=api_token)
+     connection_config = ConfluenceConnectionConfig(
+         url=confluence_url,
+         user_email=user_email,
+         access_config=access_config,
+     )
+     index_config = ConfluenceIndexerConfig(
+         max_num_of_spaces=500,
+         max_num_of_docs_from_each_space=100,
+         spaces=spaces,
+     )
+
+     download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+     # Instantiate indexer and downloader
+     indexer = ConfluenceIndexer(
+         connection_config=connection_config,
+         index_config=index_config,
+     )
+     downloader = ConfluenceDownloader(
+         connection_config=connection_config,
+         download_config=download_config,
+     )
+
+     # Run the source connector validation
+     await source_connector_validation(
+         indexer=indexer,
+         downloader=downloader,
+         configs=ValidationConfigs(
+             test_id="confluence",
+             expected_num_files=11,
+             validate_downloaded_files=True,
+         ),
+     )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+ async def test_confluence_source_large(temp_dir):
+     # Retrieve environment variables
+     confluence_url = "https://unstructured-ingest-test.atlassian.net"
+     user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+     api_token = os.environ["CONFLUENCE_API_TOKEN"]
+     spaces = ["testteamsp1"]
+
+     # Create connection and indexer configurations
+     access_config = ConfluenceAccessConfig(api_token=api_token)
+     connection_config = ConfluenceConnectionConfig(
+         url=confluence_url,
+         user_email=user_email,
+         access_config=access_config,
+     )
+     index_config = ConfluenceIndexerConfig(
+         max_num_of_spaces=10,
+         max_num_of_docs_from_each_space=250,
+         spaces=spaces,
+     )
+
+     download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+     # Instantiate indexer and downloader
+     indexer = ConfluenceIndexer(
+         connection_config=connection_config,
+         index_config=index_config,
+     )
+     downloader = ConfluenceDownloader(
+         connection_config=connection_config,
+         download_config=download_config,
+     )
+
+     # Run the source connector validation
+     await source_connector_validation(
+         indexer=indexer,
+         downloader=downloader,
+         configs=ValidationConfigs(
+             test_id="confluence_large", expected_num_files=250, validate_file_data=False
+         ),
+     )
test/integration/connectors/test_kafka.py
@@ -0,0 +1,167 @@
+ import json
+ import tempfile
+ from pathlib import Path
+
+ import pytest
+ from confluent_kafka import Consumer, KafkaError, KafkaException, Producer
+ from confluent_kafka.admin import AdminClient, NewTopic
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+     SOURCE_TAG,
+     env_setup_path,
+ )
+ from test.integration.connectors.utils.docker_compose import docker_compose_context
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.kafka.local import (
+     CONNECTOR_TYPE,
+     LocalKafkaConnectionConfig,
+     LocalKafkaDownloader,
+     LocalKafkaDownloaderConfig,
+     LocalKafkaIndexer,
+     LocalKafkaIndexerConfig,
+     LocalKafkaUploader,
+     LocalKafkaUploaderConfig,
+ )
+
+ SEED_MESSAGES = 10
+ TOPIC = "fake-topic"
+
+
+ @pytest.fixture
+ def docker_compose_ctx():
+     with docker_compose_context(docker_compose_path=env_setup_path / "kafka") as ctx:
+         yield ctx
+
+
+ @pytest.fixture
+ def kafka_seed_topic(docker_compose_ctx) -> str:
+     conf = {
+         "bootstrap.servers": "localhost:29092",
+     }
+     producer = Producer(conf)
+     for i in range(SEED_MESSAGES):
+         message = f"This is some text for message {i}"
+         producer.produce(topic=TOPIC, value=message)
+     producer.flush(timeout=10)
+     print(f"kafka topic {TOPIC} seeded with {SEED_MESSAGES} messages")
+     return TOPIC
+
+
+ @pytest.fixture
+ def kafka_upload_topic(docker_compose_ctx) -> str:
+     conf = {
+         "bootstrap.servers": "localhost:29092",
+     }
+     admin_client = AdminClient(conf)
+     admin_client.create_topics([NewTopic(TOPIC, 1, 1)])
+     return TOPIC
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ async def test_kafka_source_local(kafka_seed_topic: str):
+     connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
+     with tempfile.TemporaryDirectory() as tempdir:
+         tempdir_path = Path(tempdir)
+         download_config = LocalKafkaDownloaderConfig(download_dir=tempdir_path)
+         indexer = LocalKafkaIndexer(
+             connection_config=connection_config,
+             index_config=LocalKafkaIndexerConfig(topic=kafka_seed_topic, num_messages_to_consume=5),
+         )
+         downloader = LocalKafkaDownloader(
+             connection_config=connection_config, download_config=download_config
+         )
+         indexer.precheck()
+         await source_connector_validation(
+             indexer=indexer,
+             downloader=downloader,
+             configs=ValidationConfigs(
+                 test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+             ),
+         )
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ def test_kafka_source_local_precheck_fail():
+     connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
+     indexer = LocalKafkaIndexer(
+         connection_config=connection_config,
+         index_config=LocalKafkaIndexerConfig(topic=TOPIC, num_messages_to_consume=5),
+     )
+     with pytest.raises(SourceConnectionError):
+         indexer.precheck()
+
+
+ def get_all_messages(topic: str, max_empty_messages: int = 5) -> list[dict]:
+     conf = {
+         "bootstrap.servers": "localhost:29092",
+         "group.id": "default_group_id",
+         "enable.auto.commit": "false",
+         "auto.offset.reset": "earliest",
+     }
+     consumer = Consumer(conf)
+     consumer.subscribe([topic])
+     messages = []
+     try:
+         empty_count = 0
+         while empty_count < max_empty_messages:
+             msg = consumer.poll(timeout=1)
+             if msg is None:
+                 empty_count += 1
+                 continue
+             if msg.error():
+                 if msg.error().code() == KafkaError._PARTITION_EOF:
+                     break
+                 else:
+                     raise KafkaException(msg.error())
+             try:
+                 message = json.loads(msg.value().decode("utf8"))
+                 messages.append(message)
+             finally:
+                 consumer.commit(asynchronous=False)
+     finally:
+         print("closing consumer")
+         consumer.close()
+     return messages
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ async def test_kafka_destination_local(upload_file: Path, kafka_upload_topic: str):
+     uploader = LocalKafkaUploader(
+         connection_config=LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092),
+         upload_config=LocalKafkaUploaderConfig(topic=TOPIC, batch_size=10),
+     )
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     uploader.precheck()
+     if uploader.is_async():
+         await uploader.run_async(path=upload_file, file_data=file_data)
+     else:
+         uploader.run(path=upload_file, file_data=file_data)
+     all_messages = get_all_messages(topic=kafka_upload_topic)
+     with upload_file.open("r") as upload_fs:
+         content_to_upload = json.load(upload_fs)
+     assert len(all_messages) == len(content_to_upload), (
+         f"expected number of messages ({len(content_to_upload)}) doesn't match how many "
+         f"messages were read off of kafka topic {kafka_upload_topic}: {len(all_messages)}"
+     )
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ def test_kafka_destination_local_precheck_fail():
+     uploader = LocalKafkaUploader(
+         connection_config=LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092),
+         upload_config=LocalKafkaUploaderConfig(topic=TOPIC, batch_size=10),
+     )
+     with pytest.raises(DestinationConnectionError):
+         uploader.precheck()
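
One pattern worth noting in test_kafka_destination_local is the branch on uploader.is_async() before choosing run_async or run. A small generic helper capturing that dispatch (a sketch; the three method names all appear in the test above):

from pathlib import Path

from unstructured_ingest.v2.interfaces import FileData


async def upload(uploader, path: Path, file_data: FileData) -> None:
    # Prefer the async entry point when the uploader advertises one;
    # otherwise fall back to the synchronous run().
    if uploader.is_async():
        await uploader.run_async(path=path, file_data=file_data)
    else:
        uploader.run(path=path, file_data=file_data)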