unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.
Files changed (44)
  1. test/integration/connectors/sql/test_singlestore.py +156 -0
  2. test/integration/connectors/test_confluence.py +113 -0
  3. test/integration/connectors/test_kafka.py +67 -0
  4. test/integration/connectors/test_onedrive.py +112 -0
  5. test/integration/connectors/test_qdrant.py +137 -0
  6. test/integration/connectors/test_s3.py +1 -1
  7. test/integration/connectors/utils/docker.py +2 -1
  8. test/integration/connectors/utils/docker_compose.py +23 -8
  9. test/integration/connectors/utils/validation.py +73 -22
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/connector/kafka.py +0 -1
  12. unstructured_ingest/interfaces.py +7 -7
  13. unstructured_ingest/v2/interfaces/file_data.py +1 -0
  14. unstructured_ingest/v2/processes/chunker.py +2 -2
  15. unstructured_ingest/v2/processes/connectors/__init__.py +15 -7
  16. unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
  17. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -5
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +2 -10
  20. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  21. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +13 -0
  22. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +82 -0
  23. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +196 -0
  24. unstructured_ingest/v2/processes/connectors/kafka/local.py +75 -0
  25. unstructured_ingest/v2/processes/connectors/onedrive.py +163 -2
  26. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  27. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  28. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  29. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  30. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  31. unstructured_ingest/v2/processes/connectors/sql/__init__.py +5 -0
  32. unstructured_ingest/v2/processes/connectors/sql/postgres.py +1 -20
  33. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
  34. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  35. unstructured_ingest/v2/processes/connectors/sql/sql.py +15 -6
  36. unstructured_ingest/v2/processes/partitioner.py +14 -3
  37. unstructured_ingest/v2/unstructured_api.py +25 -11
  38. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/METADATA +17 -17
  39. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/RECORD +43 -27
  40. unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
  41. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/LICENSE.md +0 -0
  42. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/top_level.txt +0 -0
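One change worth calling out from the list above: the SingleStore connector moved from unstructured_ingest/v2/processes/connectors/singlestore.py into the connectors/sql package (items 33 and 40). A minimal sketch of the import-path update downstream code would need, assuming the public names are unchanged apart from the module path (the names below are taken from the test imports in this diff):

    # Old module path (removed in 0.2.2):
    # from unstructured_ingest.v2.processes.connectors.singlestore import SingleStoreConnectionConfig

    # New module path (added in 0.2.2):
    from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
        SingleStoreAccessConfig,
        SingleStoreConnectionConfig,
        SingleStoreUploader,
    )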
test/integration/connectors/sql/test_singlestore.py
@@ -0,0 +1,156 @@
+import tempfile
+from contextlib import contextmanager
+from pathlib import Path
+
+import pandas as pd
+import pytest
+import singlestoredb as s2
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
+    CONNECTOR_TYPE,
+    SingleStoreAccessConfig,
+    SingleStoreConnectionConfig,
+    SingleStoreDownloader,
+    SingleStoreDownloaderConfig,
+    SingleStoreIndexer,
+    SingleStoreIndexerConfig,
+    SingleStoreUploader,
+    SingleStoreUploaderConfig,
+    SingleStoreUploadStager,
+)
+
+SEED_DATA_ROWS = 20
+
+
+@contextmanager
+def singlestore_download_setup(connect_params: dict) -> None:
+    with docker_compose_context(
+        docker_compose_path=env_setup_path / "sql" / "singlestore" / "source"
+    ):
+        with s2.connect(**connect_params) as connection:
+            with connection.cursor() as cursor:
+                for i in range(SEED_DATA_ROWS):
+                    sql_statement = f"INSERT INTO cars (brand, price) VALUES ('brand_{i}', {i})"
+                    cursor.execute(sql_statement)
+                connection.commit()
+        yield
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+async def test_singlestore_source():
+    connect_params = {
+        "host": "localhost",
+        "port": 3306,
+        "database": "ingest_test",
+        "user": "root",
+        "password": "password",
+    }
+    with singlestore_download_setup(connect_params=connect_params):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            connection_config = SingleStoreConnectionConfig(
+                host=connect_params["host"],
+                port=connect_params["port"],
+                database=connect_params["database"],
+                user=connect_params["user"],
+                access_config=SingleStoreAccessConfig(password=connect_params["password"]),
+            )
+            indexer = SingleStoreIndexer(
+                connection_config=connection_config,
+                index_config=SingleStoreIndexerConfig(
+                    table_name="cars", id_column="car_id", batch_size=5
+                ),
+            )
+            downloader = SingleStoreDownloader(
+                connection_config=connection_config,
+                download_config=SingleStoreDownloaderConfig(
+                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                ),
+            )
+            await source_connector_validation(
+                indexer=indexer,
+                downloader=downloader,
+                configs=ValidationConfigs(
+                    test_id="singlestore",
+                    expected_num_files=SEED_DATA_ROWS,
+                    expected_number_indexed_file_data=4,
+                    validate_downloaded_files=True,
+                ),
+            )
+
+
+def validate_destination(
+    connect_params: dict,
+    expected_num_elements: int,
+):
+    with s2.connect(**connect_params) as connection:
+        with connection.cursor() as cursor:
+            query = "select count(*) from elements;"
+            cursor.execute(query)
+            count = cursor.fetchone()[0]
+            assert (
+                count == expected_num_elements
+            ), f"dest check failed: got {count}, expected {expected_num_elements}"
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+async def test_singlestore_destination(upload_file: Path):
+    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    with docker_compose_context(
+        docker_compose_path=env_setup_path / "sql" / "singlestore" / "destination"
+    ):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            stager = SingleStoreUploadStager()
+            stager_params = {
+                "elements_filepath": upload_file,
+                "file_data": mock_file_data,
+                "output_dir": Path(tmpdir),
+                "output_filename": "test_db",
+            }
+            if stager.is_async():
+                staged_path = await stager.run_async(**stager_params)
+            else:
+                staged_path = stager.run(**stager_params)
+
+            # The stager should append the `.json` suffix to the output filename passed in.
+            assert staged_path.name == "test_db.json"
+
+            connect_params = {
+                "host": "localhost",
+                "port": 3306,
+                "database": "ingest_test",
+                "user": "root",
+                "password": "password",
+            }
+
+            uploader = SingleStoreUploader(
+                connection_config=SingleStoreConnectionConfig(
+                    host=connect_params["host"],
+                    port=connect_params["port"],
+                    database=connect_params["database"],
+                    user=connect_params["user"],
+                    access_config=SingleStoreAccessConfig(password=connect_params["password"]),
+                ),
+                upload_config=SingleStoreUploaderConfig(
+                    table_name="elements",
+                ),
+            )
+            if uploader.is_async():
+                await uploader.run_async(path=staged_path, file_data=mock_file_data)
+            else:
+                uploader.run(path=staged_path, file_data=mock_file_data)
+
+            staged_df = pd.read_json(staged_path, orient="records", lines=True)
+            expected_num_elements = len(staged_df)
+            validate_destination(
+                connect_params=connect_params,
+                expected_num_elements=expected_num_elements,
+            )
test/integration/connectors/test_confluence.py
@@ -0,0 +1,113 @@
+import os
+
+import pytest
+
+from test.integration.connectors.utils.constants import (
+    SOURCE_TAG,
+)
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.processes.connectors.confluence import (
+    CONNECTOR_TYPE,
+    ConfluenceAccessConfig,
+    ConfluenceConnectionConfig,
+    ConfluenceDownloader,
+    ConfluenceDownloaderConfig,
+    ConfluenceIndexer,
+    ConfluenceIndexerConfig,
+)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+async def test_confluence_source(temp_dir):
+    # Retrieve environment variables
+    confluence_url = "https://unstructured-ingest-test.atlassian.net"
+    user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+    api_token = os.environ["CONFLUENCE_API_TOKEN"]
+    spaces = ["testteamsp", "MFS"]
+
+    # Create connection and indexer configurations
+    access_config = ConfluenceAccessConfig(api_token=api_token)
+    connection_config = ConfluenceConnectionConfig(
+        url=confluence_url,
+        user_email=user_email,
+        access_config=access_config,
+    )
+    index_config = ConfluenceIndexerConfig(
+        max_num_of_spaces=500,
+        max_num_of_docs_from_each_space=100,
+        spaces=spaces,
+    )
+
+    download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+    # Instantiate indexer and downloader
+    indexer = ConfluenceIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = ConfluenceDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=ValidationConfigs(
+            test_id="confluence",
+            expected_num_files=11,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+async def test_confluence_source_large(temp_dir):
+    # Retrieve environment variables
+    confluence_url = "https://unstructured-ingest-test.atlassian.net"
+    user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+    api_token = os.environ["CONFLUENCE_API_TOKEN"]
+    spaces = ["testteamsp1"]
+
+    # Create connection and indexer configurations
+    access_config = ConfluenceAccessConfig(api_token=api_token)
+    connection_config = ConfluenceConnectionConfig(
+        url=confluence_url,
+        user_email=user_email,
+        access_config=access_config,
+    )
+    index_config = ConfluenceIndexerConfig(
+        max_num_of_spaces=10,
+        max_num_of_docs_from_each_space=250,
+        spaces=spaces,
+    )
+
+    download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+    # Instantiate indexer and downloader
+    indexer = ConfluenceIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = ConfluenceDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=ValidationConfigs(
+            test_id="confluence_large", expected_num_files=250, validate_file_data=False
+        ),
+    )
test/integration/connectors/test_kafka.py
@@ -0,0 +1,67 @@
+import socket
+import tempfile
+from pathlib import Path
+
+import pytest
+from confluent_kafka import Producer
+
+from test.integration.connectors.utils.constants import (
+    SOURCE_TAG,
+    env_setup_path,
+)
+from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from unstructured_ingest.v2.processes.connectors.kafka.local import (
+    CONNECTOR_TYPE,
+    LocalKafkaConnectionConfig,
+    LocalKafkaDownloader,
+    LocalKafkaDownloaderConfig,
+    LocalKafkaIndexer,
+    LocalKafkaIndexerConfig,
+)
+
+SEED_MESSAGES = 10
+TOPIC = "fake-topic"
+
+
+@pytest.fixture
+def kafka_seed_topic() -> str:
+    with docker_compose_context(docker_compose_path=env_setup_path / "kafka"):
+        conf = {
+            "bootstrap.servers": "localhost:29092",
+            "client.id": socket.gethostname(),
+            "message.max.bytes": 10485760,
+        }
+        producer = Producer(conf)
+        for i in range(SEED_MESSAGES):
+            message = f"This is some text for message {i}"
+            producer.produce(topic=TOPIC, value=message)
+        producer.flush(timeout=10)
+        print(f"kafka topic {TOPIC} seeded with {SEED_MESSAGES} messages")
+        yield TOPIC
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+async def test_kafka_source_local(kafka_seed_topic: str):
+    connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
+    with tempfile.TemporaryDirectory() as tempdir:
+        tempdir_path = Path(tempdir)
+        download_config = LocalKafkaDownloaderConfig(download_dir=tempdir_path)
+        indexer = LocalKafkaIndexer(
+            connection_config=connection_config,
+            index_config=LocalKafkaIndexerConfig(topic=kafka_seed_topic, num_messages_to_consume=5),
+        )
+        downloader = LocalKafkaDownloader(
+            connection_config=connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=ValidationConfigs(
+                test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+            ),
+        )
test/integration/connectors/test_onedrive.py
@@ -0,0 +1,112 @@
+import os
+import uuid
+from pathlib import Path
+
+import pytest
+from office365.graph_client import GraphClient
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.onedrive import (
+    CONNECTOR_TYPE,
+    OnedriveAccessConfig,
+    OnedriveConnectionConfig,
+    OnedriveUploader,
+    OnedriveUploaderConfig,
+)
+
+
+@pytest.fixture
+def onedrive_test_folder() -> str:
+    """
+    Pytest fixture that creates a test folder in OneDrive and deletes it after the test run.
+    """
+    connection_config = get_connection_config()
+    user_pname = connection_config.user_pname
+
+    # Get the OneDrive client
+    client: GraphClient = connection_config.get_client()
+    drive = client.users[user_pname].drive
+
+    # Generate a unique test folder path
+    test_folder_path = f"utic-test-output-{uuid.uuid4()}"
+
+    # Create the test folder
+    root = drive.root
+    folder = root.create_folder(test_folder_path).execute_query()
+    print(f"created folder: {folder.name}")
+    try:
+        yield test_folder_path
+    finally:
+        # Teardown: delete the test folder and its contents
+        folder.delete_object().execute_query()
+        print(f"successfully deleted folder: {folder.name}")
+
+
+def get_connection_config():
+    """
+    Helper that builds the OnedriveConnectionConfig for tests.
+    """
+    client_id = os.getenv("MS_CLIENT_ID")
+    client_secret = os.getenv("MS_CLIENT_CRED")
+    tenant_id = os.getenv("MS_TENANT_ID")
+    user_pname = os.getenv("MS_USER_PNAME")
+
+    connection_config = OnedriveConnectionConfig(
+        client_id=client_id,
+        tenant=tenant_id,
+        user_pname=user_pname,
+        access_config=OnedriveAccessConfig(client_cred=client_secret),
+    )
+    return connection_config
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
+    """
+    Integration test for the OneDrive destination connector.
+
+    This test uploads a file to OneDrive and verifies that it exists.
+    """
+    connection_config = get_connection_config()
+    # Retrieve user principal name from the connection config
+    user_pname = connection_config.user_pname
+
+    # The test folder is provided by the fixture
+    destination_folder = onedrive_test_folder
+    destination_fullpath = f"{destination_folder}/{upload_file.name}"
+
+    # Configure the uploader with remote_url
+    upload_config = OnedriveUploaderConfig(remote_url=f"onedrive://{destination_folder}")
+
+    uploader = OnedriveUploader(
+        connection_config=connection_config,
+        upload_config=upload_config,
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(
+            fullpath=destination_fullpath,
+            filename=upload_file.name,
+        ),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock_file_data",
+    )
+    uploader.precheck()
+    uploader.run(path=upload_file, file_data=file_data)
+
+    # Verify that the file was uploaded
+    client = connection_config.get_client()
+    drive = client.users[user_pname].drive
+
+    uploaded_file = (
+        drive.root.get_by_path(destination_fullpath).select(["id", "name"]).get().execute_query()
+    )
+
+    # Check if the file exists
+    assert uploaded_file is not None
+    assert uploaded_file.name == upload_file.name
test/integration/connectors/test_qdrant.py
@@ -0,0 +1,137 @@
+import json
+import uuid
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import AsyncGenerator
+
+import pytest
+from qdrant_client import AsyncQdrantClient
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.docker import container_context
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+    CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+    LocalQdrantConnectionConfig,
+    LocalQdrantUploader,
+    LocalQdrantUploaderConfig,
+    LocalQdrantUploadStager,
+    LocalQdrantUploadStagerConfig,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+    CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+    ServerQdrantConnectionConfig,
+    ServerQdrantUploader,
+    ServerQdrantUploaderConfig,
+    ServerQdrantUploadStager,
+    ServerQdrantUploadStagerConfig,
+)
+
+COLLECTION_NAME = f"test-coll-{uuid.uuid4().hex[:12]}"
+VECTORS_CONFIG = {"size": 384, "distance": "Cosine"}
+
+
+@asynccontextmanager
+async def qdrant_client(client_params: dict) -> AsyncGenerator[AsyncQdrantClient, None]:
+    client = AsyncQdrantClient(**client_params)
+    try:
+        yield client
+    finally:
+        await client.close()
+
+
+async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+    expected_point_count = len(elements)
+    first_element = elements[0]
+    expected_text = first_element["text"]
+    embeddings = first_element["embeddings"]
+    collection = await client.get_collection(COLLECTION_NAME)
+    assert collection.points_count == expected_point_count
+
+    response = await client.query_points(COLLECTION_NAME, query=embeddings, limit=1)
+    assert response.points[0].payload is not None
+    assert response.points[0].payload["text"] == expected_text
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {"path": str(tmp_path / "qdrant")}
+    async with qdrant_client(connection_kwargs) as client:
+        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+    AsyncQdrantClient(**connection_kwargs)
+    stager = LocalQdrantUploadStager(
+        upload_stager_config=LocalQdrantUploadStagerConfig(),
+    )
+    uploader = LocalQdrantUploader(
+        connection_config=LocalQdrantConnectionConfig(**connection_kwargs),
+        upload_config=LocalQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=LOCAL_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    staged_upload_file = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    if uploader.is_async():
+        await uploader.run_async(path=staged_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=staged_upload_file, file_data=file_data)
+    async with qdrant_client(connection_kwargs) as client:
+        await validate_upload(client=client, upload_file=upload_file)
+
+
+@pytest.fixture
+def docker_context():
+    with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
+        yield container
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
+    connection_kwargs = {"location": "http://localhost:6333"}
+    async with qdrant_client(connection_kwargs) as client:
+        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+    AsyncQdrantClient(**connection_kwargs)
+    stager = ServerQdrantUploadStager(
+        upload_stager_config=ServerQdrantUploadStagerConfig(),
+    )
+    uploader = ServerQdrantUploader(
+        connection_config=ServerQdrantConnectionConfig(**connection_kwargs),
+        upload_config=ServerQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=SERVER_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    staged_upload_file = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    if uploader.is_async():
+        await uploader.run_async(path=staged_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=staged_upload_file, file_data=file_data)
+    async with qdrant_client(connection_kwargs) as client:
+        await validate_upload(client=client, upload_file=upload_file)
test/integration/connectors/test_s3.py
@@ -85,7 +85,7 @@ async def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
 async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
     anon_connection_config.endpoint_url = "http://localhost:9000"
     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
-    with docker_compose_context(docker_compose_path=env_setup_path / "minio"):
+    with docker_compose_context(docker_compose_path=env_setup_path / "minio" / "source"):
         with tempfile.TemporaryDirectory() as tempdir:
             tempdir_path = Path(tempdir)
             download_config = S3DownloaderConfig(download_dir=tempdir_path)
test/integration/connectors/utils/docker.py
@@ -47,14 +47,15 @@ def healthcheck_wait(container: Container, timeout: int = 10) -> None:
 
 @contextmanager
 def container_context(
-    docker_client: docker.DockerClient,
     image: str,
     ports: dict,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
     healthcheck: Optional[dict] = None,
     healthcheck_timeout: int = 10,
+    docker_client: Optional[docker.DockerClient] = None,
 ):
+    docker_client = docker_client or docker.from_env()
     container: Optional[Container] = None
     try:
         container = get_container(
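With docker_client now optional and defaulting to docker.from_env(), callers can open a container without constructing a client first. A minimal sketch of the call shape, mirroring the qdrant fixture in this diff:

    from test.integration.connectors.utils.docker import container_context

    # docker_client omitted: per this diff, the helper falls back to docker.from_env()
    with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
        ...  # exercise the containerized service, e.g. over localhost:6333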
test/integration/connectors/utils/docker_compose.py
@@ -3,6 +3,23 @@ from contextlib import contextmanager
 from pathlib import Path
 
 
+def docker_compose_down(docker_compose_path: Path):
+    cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v --rmi all"
+    print(f"Running command: {cmd}")
+    final_resp = subprocess.run(
+        cmd,
+        shell=True,
+        capture_output=True,
+    )
+    if final_resp.returncode != 0:
+        print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
+        print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))
+
+
+def run_cleanup(docker_compose_path: Path):
+    docker_compose_down(docker_compose_path=docker_compose_path)
+
+
 @contextmanager
 def docker_compose_context(docker_compose_path: Path):
     # Dynamically run a specific docker compose file and make sure it gets cleanup by
@@ -30,15 +47,13 @@ def docker_compose_context(docker_compose_path: Path):
         if resp:
             print("STDOUT: {}".format(resp.stdout.decode("utf-8")))
             print("STDERR: {}".format(resp.stderr.decode("utf-8")))
-        raise e
-    finally:
-        cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v"
-        print(f"Running command: {cmd}")
-        final_resp = subprocess.run(
+        cmd = f"docker compose -f {docker_compose_path.resolve()} logs"
+        logs = subprocess.run(
             cmd,
             shell=True,
             capture_output=True,
         )
-        if final_resp.returncode != 0:
-            print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
-            print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))
+        print("DOCKER LOGS: {}".format(logs.stdout.decode("utf-8")))
+        raise e
+    finally:
+        run_cleanup(docker_compose_path=docker_compose_path)
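Taken together, these two hunks move teardown into a reusable run_cleanup() helper (which now also removes images via --rmi all) and dump docker compose logs before re-raising on failure. A sketch of how a test consumes this, assuming the compose behavior described above; the compose path below is hypothetical:

    from pathlib import Path

    from test.integration.connectors.utils.docker_compose import docker_compose_context

    def test_against_compose_stack():
        compose_dir = Path("test/integration/env_setup/kafka")  # hypothetical path
        with docker_compose_context(docker_compose_path=compose_dir):
            ...  # exercise the services; on error, compose logs are printed
        # on exit (success or failure) run_cleanup() has torn the stack down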