unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/sql/test_singlestore.py +156 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_kafka.py +67 -0
- test/integration/connectors/test_onedrive.py +112 -0
- test/integration/connectors/test_qdrant.py +137 -0
- test/integration/connectors/test_s3.py +1 -1
- test/integration/connectors/utils/docker.py +2 -1
- test/integration/connectors/utils/docker_compose.py +23 -8
- test/integration/connectors/utils/validation.py +73 -22
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/connector/kafka.py +0 -1
- unstructured_ingest/interfaces.py +7 -7
- unstructured_ingest/v2/interfaces/file_data.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +2 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +15 -7
- unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
- unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -5
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +2 -10
- unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +82 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +196 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +75 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +163 -2
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +5 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +1 -20
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +15 -6
- unstructured_ingest/v2/processes/partitioner.py +14 -3
- unstructured_ingest/v2/unstructured_api.py +25 -11
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/METADATA +17 -17
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/RECORD +43 -27
- unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/top_level.txt +0 -0
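One structural change worth calling out from the file list: the SingleStore connector now lives under the sql subpackage (unstructured_ingest/v2/processes/connectors/singlestore.py is removed and unstructured_ingest/v2/processes/connectors/sql/singlestore.py is added), so imports written against 0.2.0 need the new path. A minimal sketch of the change, assuming the old module exported the same class names:

# 0.2.0 (module removed in 0.2.2):
# from unstructured_ingest.v2.processes.connectors.singlestore import SingleStoreConnectionConfig

# 0.2.2 (new location, matching the imports in the added integration test below):
from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
    SingleStoreAccessConfig,
    SingleStoreConnectionConfig,
)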
test/integration/connectors/sql/test_singlestore.py
@@ -0,0 +1,156 @@
+import tempfile
+from contextlib import contextmanager
+from pathlib import Path
+
+import pandas as pd
+import pytest
+import singlestoredb as s2
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
+    CONNECTOR_TYPE,
+    SingleStoreAccessConfig,
+    SingleStoreConnectionConfig,
+    SingleStoreDownloader,
+    SingleStoreDownloaderConfig,
+    SingleStoreIndexer,
+    SingleStoreIndexerConfig,
+    SingleStoreUploader,
+    SingleStoreUploaderConfig,
+    SingleStoreUploadStager,
+)
+
+SEED_DATA_ROWS = 20
+
+
+@contextmanager
+def singlestore_download_setup(connect_params: dict) -> None:
+    with docker_compose_context(
+        docker_compose_path=env_setup_path / "sql" / "singlestore" / "source"
+    ):
+        with s2.connect(**connect_params) as connection:
+            with connection.cursor() as cursor:
+                for i in range(SEED_DATA_ROWS):
+                    sql_statment = f"INSERT INTO cars (brand, price) VALUES " f"('brand_{i}', {i})"
+                    cursor.execute(sql_statment)
+            connection.commit()
+        yield
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+async def test_singlestore_source():
+    connect_params = {
+        "host": "localhost",
+        "port": 3306,
+        "database": "ingest_test",
+        "user": "root",
+        "password": "password",
+    }
+    with singlestore_download_setup(connect_params=connect_params):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            connection_config = SingleStoreConnectionConfig(
+                host=connect_params["host"],
+                port=connect_params["port"],
+                database=connect_params["database"],
+                user=connect_params["user"],
+                access_config=SingleStoreAccessConfig(password=connect_params["password"]),
+            )
+            indexer = SingleStoreIndexer(
+                connection_config=connection_config,
+                index_config=SingleStoreIndexerConfig(
+                    table_name="cars", id_column="car_id", batch_size=5
+                ),
+            )
+            downloader = SingleStoreDownloader(
+                connection_config=connection_config,
+                download_config=SingleStoreDownloaderConfig(
+                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                ),
+            )
+            await source_connector_validation(
+                indexer=indexer,
+                downloader=downloader,
+                configs=ValidationConfigs(
+                    test_id="singlestore",
+                    expected_num_files=SEED_DATA_ROWS,
+                    expected_number_indexed_file_data=4,
+                    validate_downloaded_files=True,
+                ),
+            )
+
+
+def validate_destination(
+    connect_params: dict,
+    expected_num_elements: int,
+):
+    with s2.connect(**connect_params) as connection:
+        with connection.cursor() as cursor:
+            query = "select count(*) from elements;"
+            cursor.execute(query)
+            count = cursor.fetchone()[0]
+            assert (
+                count == expected_num_elements
+            ), f"dest check failed: got {count}, expected {expected_num_elements}"
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+async def test_singlestore_destination(upload_file: Path):
+    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+    with docker_compose_context(
+        docker_compose_path=env_setup_path / "sql" / "singlestore" / "destination"
+    ):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            stager = SingleStoreUploadStager()
+            stager_params = {
+                "elements_filepath": upload_file,
+                "file_data": mock_file_data,
+                "output_dir": Path(tmpdir),
+                "output_filename": "test_db",
+            }
+            if stager.is_async():
+                staged_path = await stager.run_async(**stager_params)
+            else:
+                staged_path = stager.run(**stager_params)
+
+            # The stager should append the `.json` suffix to the output filename passed in.
+            assert staged_path.name == "test_db.json"
+
+            connect_params = {
+                "host": "localhost",
+                "port": 3306,
+                "database": "ingest_test",
+                "user": "root",
+                "password": "password",
+            }
+
+            uploader = SingleStoreUploader(
+                connection_config=SingleStoreConnectionConfig(
+                    host=connect_params["host"],
+                    port=connect_params["port"],
+                    database=connect_params["database"],
+                    user=connect_params["user"],
+                    access_config=SingleStoreAccessConfig(password=connect_params["password"]),
+                ),
+                upload_config=SingleStoreUploaderConfig(
+                    table_name="elements",
+                ),
+            )
+            if uploader.is_async():
+                await uploader.run_async(path=staged_path, file_data=mock_file_data)
+            else:
+                uploader.run(path=staged_path, file_data=mock_file_data)
+
+            staged_df = pd.read_json(staged_path, orient="records", lines=True)
+            expected_num_elements = len(staged_df)
+            validate_destination(
+                connect_params=connect_params,
+                expected_num_elements=expected_num_elements,
+            )
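The staging and upload calls in the destination test above follow an is_async()/run()/run_async() dispatch pattern that repeats in the other new tests in this release. A small sketch of that pattern as a reusable helper; the helper itself is hypothetical and not part of the package:

# Hypothetical convenience wrapper around the is_async()/run()/run_async()
# protocol used by the stagers and uploaders in the tests above.
async def run_process(process, **kwargs):
    # Prefer the async entry point when the process advertises one,
    # otherwise fall back to the synchronous run().
    if process.is_async():
        return await process.run_async(**kwargs)
    return process.run(**kwargs)

With such a helper, the stager call above could read staged_path = await run_process(stager, **stager_params).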
test/integration/connectors/test_confluence.py
@@ -0,0 +1,113 @@
+import os
+
+import pytest
+
+from test.integration.connectors.utils.constants import (
+    SOURCE_TAG,
+)
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.processes.connectors.confluence import (
+    CONNECTOR_TYPE,
+    ConfluenceAccessConfig,
+    ConfluenceConnectionConfig,
+    ConfluenceDownloader,
+    ConfluenceDownloaderConfig,
+    ConfluenceIndexer,
+    ConfluenceIndexerConfig,
+)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+async def test_confluence_source(temp_dir):
+    # Retrieve environment variables
+    confluence_url = "https://unstructured-ingest-test.atlassian.net"
+    user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+    api_token = os.environ["CONFLUENCE_API_TOKEN"]
+    spaces = ["testteamsp", "MFS"]
+
+    # Create connection and indexer configurations
+    access_config = ConfluenceAccessConfig(api_token=api_token)
+    connection_config = ConfluenceConnectionConfig(
+        url=confluence_url,
+        user_email=user_email,
+        access_config=access_config,
+    )
+    index_config = ConfluenceIndexerConfig(
+        max_num_of_spaces=500,
+        max_num_of_docs_from_each_space=100,
+        spaces=spaces,
+    )
+
+    download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+    # Instantiate indexer and downloader
+    indexer = ConfluenceIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = ConfluenceDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=ValidationConfigs(
+            test_id="confluence",
+            expected_num_files=11,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
+async def test_confluence_source_large(temp_dir):
+    # Retrieve environment variables
+    confluence_url = "https://unstructured-ingest-test.atlassian.net"
+    user_email = os.environ["CONFLUENCE_USER_EMAIL"]
+    api_token = os.environ["CONFLUENCE_API_TOKEN"]
+    spaces = ["testteamsp1"]
+
+    # Create connection and indexer configurations
+    access_config = ConfluenceAccessConfig(api_token=api_token)
+    connection_config = ConfluenceConnectionConfig(
+        url=confluence_url,
+        user_email=user_email,
+        access_config=access_config,
+    )
+    index_config = ConfluenceIndexerConfig(
+        max_num_of_spaces=10,
+        max_num_of_docs_from_each_space=250,
+        spaces=spaces,
+    )
+
+    download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
+
+    # Instantiate indexer and downloader
+    indexer = ConfluenceIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+    )
+    downloader = ConfluenceDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+    )
+
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=ValidationConfigs(
+            test_id="confluence_large", expected_num_files=250, validate_file_data=False
+        ),
+    )
test/integration/connectors/test_kafka.py
@@ -0,0 +1,67 @@
+import socket
+import tempfile
+from pathlib import Path
+
+import pytest
+from confluent_kafka import Producer
+
+from test.integration.connectors.utils.constants import (
+    SOURCE_TAG,
+    env_setup_path,
+)
+from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from unstructured_ingest.v2.processes.connectors.kafka.local import (
+    CONNECTOR_TYPE,
+    LocalKafkaConnectionConfig,
+    LocalKafkaDownloader,
+    LocalKafkaDownloaderConfig,
+    LocalKafkaIndexer,
+    LocalKafkaIndexerConfig,
+)
+
+SEED_MESSAGES = 10
+TOPIC = "fake-topic"
+
+
+@pytest.fixture
+def kafka_seed_topic() -> str:
+    with docker_compose_context(docker_compose_path=env_setup_path / "kafka"):
+        conf = {
+            "bootstrap.servers": "localhost:29092",
+            "client.id": socket.gethostname(),
+            "message.max.bytes": 10485760,
+        }
+        producer = Producer(conf)
+        for i in range(SEED_MESSAGES):
+            message = f"This is some text for message {i}"
+            producer.produce(topic=TOPIC, value=message)
+        producer.flush(timeout=10)
+        print(f"kafka topic {TOPIC} seeded with {SEED_MESSAGES} messages")
+        yield TOPIC
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+async def test_kafka_source_local(kafka_seed_topic: str):
+    connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
+    with tempfile.TemporaryDirectory() as tempdir:
+        tempdir_path = Path(tempdir)
+        download_config = LocalKafkaDownloaderConfig(download_dir=tempdir_path)
+        indexer = LocalKafkaIndexer(
+            connection_config=connection_config,
+            index_config=LocalKafkaIndexerConfig(topic=kafka_seed_topic, num_messages_to_consume=5),
+        )
+        downloader = LocalKafkaDownloader(
+            connection_config=connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=ValidationConfigs(
+                test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+            ),
+        )
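The kafka_seed_topic fixture relies on the broker exposed by the compose file at env_setup_path / "kafka" on localhost:29092. To manually confirm the seeded topic is readable outside the connector, a plain confluent_kafka consumer can be pointed at the same broker; this is an illustrative sketch only (the group id is arbitrary), not part of the test suite:

from confluent_kafka import Consumer

# Mirror the broker address used by the producer in the fixture above.
consumer = Consumer(
    {
        "bootstrap.servers": "localhost:29092",
        "group.id": "ingest-smoke-check",  # arbitrary group id for this check
        "auto.offset.reset": "earliest",
    }
)
consumer.subscribe(["fake-topic"])
msg = consumer.poll(timeout=10)
assert msg is not None and msg.error() is None
print(msg.value().decode("utf-8"))
consumer.close()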
test/integration/connectors/test_onedrive.py
@@ -0,0 +1,112 @@
+import os
+import uuid
+from pathlib import Path
+
+import pytest
+from office365.graph_client import GraphClient
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.onedrive import (
+    CONNECTOR_TYPE,
+    OnedriveAccessConfig,
+    OnedriveConnectionConfig,
+    OnedriveUploader,
+    OnedriveUploaderConfig,
+)
+
+
+@pytest.fixture
+def onedrive_test_folder() -> str:
+    """
+    Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
+    """
+    connection_config = get_connection_config()
+    user_pname = connection_config.user_pname
+
+    # Get the OneDrive client
+    client: GraphClient = connection_config.get_client()
+    drive = client.users[user_pname].drive
+
+    # Generate a unique test folder path
+    test_folder_path = f"utic-test-output-{uuid.uuid4()}"
+
+    # Create the test folder
+    root = drive.root
+    folder = root.create_folder(test_folder_path).execute_query()
+    print(f"created folder: {folder.name}")
+    try:
+        yield test_folder_path
+    finally:
+        # Teardown: delete the test folder and its contents
+        folder.delete_object().execute_query()
+        print(f"successfully deleted folder: {folder.name}")
+
+
+def get_connection_config():
+    """
+    Pytest fixture that provides the OnedriveConnectionConfig for tests.
+    """
+    client_id = os.getenv("MS_CLIENT_ID")
+    client_secret = os.getenv("MS_CLIENT_CRED")
+    tenant_id = os.getenv("MS_TENANT_ID")
+    user_pname = os.getenv("MS_USER_PNAME")
+
+    connection_config = OnedriveConnectionConfig(
+        client_id=client_id,
+        tenant=tenant_id,
+        user_pname=user_pname,
+        access_config=OnedriveAccessConfig(client_cred=client_secret),
+    )
+    return connection_config
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
+    """
+    Integration test for the OneDrive destination connector.
+
+    This test uploads a file to OneDrive and verifies that it exists.
+    """
+    connection_config = get_connection_config()
+    # Retrieve user principal name from the connection config
+    user_pname = connection_config.user_pname
+
+    # The test folder is provided by the fixture
+    destination_folder = onedrive_test_folder
+    destination_fullpath = f"{destination_folder}/{upload_file.name}"
+
+    # Configure the uploader with remote_url
+    upload_config = OnedriveUploaderConfig(remote_url=f"onedrive://{destination_folder}")
+
+    uploader = OnedriveUploader(
+        connection_config=connection_config,
+        upload_config=upload_config,
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(
+            fullpath=destination_fullpath,
+            filename=upload_file.name,
+        ),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock_file_data",
+    )
+    uploader.precheck()
+    uploader.run(path=upload_file, file_data=file_data)
+
+    # Verify that the file was uploaded
+    client = connection_config.get_client()
+    drive = client.users[user_pname].drive
+
+    uploaded_file = (
+        drive.root.get_by_path(destination_fullpath).select(["id", "name"]).get().execute_query()
+    )
+
+    # Check if the file exists
+    assert uploaded_file is not None
+    assert uploaded_file.name == upload_file.name
test/integration/connectors/test_qdrant.py
@@ -0,0 +1,137 @@
+import json
+import uuid
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import AsyncGenerator
+
+import pytest
+from qdrant_client import AsyncQdrantClient
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.docker import container_context
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+    CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+    LocalQdrantConnectionConfig,
+    LocalQdrantUploader,
+    LocalQdrantUploaderConfig,
+    LocalQdrantUploadStager,
+    LocalQdrantUploadStagerConfig,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+    CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE,
+)
+from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+    ServerQdrantConnectionConfig,
+    ServerQdrantUploader,
+    ServerQdrantUploaderConfig,
+    ServerQdrantUploadStager,
+    ServerQdrantUploadStagerConfig,
+)
+
+COLLECTION_NAME = f"test-coll-{uuid.uuid4().hex[:12]}"
+VECTORS_CONFIG = {"size": 384, "distance": "Cosine"}
+
+
+@asynccontextmanager
+async def qdrant_client(client_params: dict) -> AsyncGenerator[AsyncQdrantClient, None]:
+    client = AsyncQdrantClient(**client_params)
+    try:
+        yield client
+    finally:
+        await client.close()
+
+
+async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
+    with upload_file.open() as upload_fp:
+        elements = json.load(upload_fp)
+    expected_point_count = len(elements)
+    first_element = elements[0]
+    expected_text = first_element["text"]
+    embeddings = first_element["embeddings"]
+    collection = await client.get_collection(COLLECTION_NAME)
+    assert collection.points_count == expected_point_count
+
+    response = await client.query_points(COLLECTION_NAME, query=embeddings, limit=1)
+    assert response.points[0].payload is not None
+    assert response.points[0].payload["text"] == expected_text
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
+    connection_kwargs = {"path": str(tmp_path / "qdrant")}
+    async with qdrant_client(connection_kwargs) as client:
+        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+    AsyncQdrantClient(**connection_kwargs)
+    stager = LocalQdrantUploadStager(
+        upload_stager_config=LocalQdrantUploadStagerConfig(),
+    )
+    uploader = LocalQdrantUploader(
+        connection_config=LocalQdrantConnectionConfig(**connection_kwargs),
+        upload_config=LocalQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=LOCAL_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    staged_upload_file = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    if uploader.is_async():
+        await uploader.run_async(path=staged_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=upload_file, file_data=file_data)
+    async with qdrant_client(connection_kwargs) as client:
+        await validate_upload(client=client, upload_file=upload_file)
+
+
+@pytest.fixture
+def docker_context():
+    with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
+        yield container
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
+    connection_kwargs = {"location": "http://localhost:6333"}
+    async with qdrant_client(connection_kwargs) as client:
+        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+    AsyncQdrantClient(**connection_kwargs)
+    stager = ServerQdrantUploadStager(
+        upload_stager_config=ServerQdrantUploadStagerConfig(),
+    )
+    uploader = ServerQdrantUploader(
+        connection_config=ServerQdrantConnectionConfig(**connection_kwargs),
+        upload_config=ServerQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=SERVER_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    staged_upload_file = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    if uploader.is_async():
+        await uploader.run_async(path=staged_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=upload_file, file_data=file_data)
+    async with qdrant_client(connection_kwargs) as client:
+        await validate_upload(client=client, upload_file=upload_file)
|
|
|
85
85
|
async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
|
|
86
86
|
anon_connection_config.endpoint_url = "http://localhost:9000"
|
|
87
87
|
indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
|
|
88
|
-
with docker_compose_context(docker_compose_path=env_setup_path / "minio"):
|
|
88
|
+
with docker_compose_context(docker_compose_path=env_setup_path / "minio" / "source"):
|
|
89
89
|
with tempfile.TemporaryDirectory() as tempdir:
|
|
90
90
|
tempdir_path = Path(tempdir)
|
|
91
91
|
download_config = S3DownloaderConfig(download_dir=tempdir_path)
|
|
test/integration/connectors/utils/docker.py
@@ -47,14 +47,15 @@ def healthcheck_wait(container: Container, timeout: int = 10) -> None:
 
 @contextmanager
 def container_context(
-    docker_client: docker.DockerClient,
     image: str,
     ports: dict,
     environment: Optional[dict] = None,
     volumes: Optional[dict] = None,
     healthcheck: Optional[dict] = None,
     healthcheck_timeout: int = 10,
+    docker_client: Optional[docker.DockerClient] = None,
 ):
+    docker_client = docker_client or docker.from_env()
     container: Optional[Container] = None
     try:
         container = get_container(
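Since docker_client is now optional and defaults to docker.from_env(), tests can start a throwaway container without constructing a client first; the new Qdrant test above already calls the helper this way. A minimal usage sketch using the image and port mapping from that test:

from test.integration.connectors.utils.docker import container_context

# No docker_client argument: the helper now falls back to docker.from_env().
with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
    ...  # exercise the containerized service, e.g. a Qdrant server on localhost:6333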
test/integration/connectors/utils/docker_compose.py
@@ -3,6 +3,23 @@ from contextlib import contextmanager
 from pathlib import Path
 
 
+def docker_compose_down(docker_compose_path: Path):
+    cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v --rmi all"
+    print(f"Running command: {cmd}")
+    final_resp = subprocess.run(
+        cmd,
+        shell=True,
+        capture_output=True,
+    )
+    if final_resp.returncode != 0:
+        print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
+        print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))
+
+
+def run_cleanup(docker_compose_path: Path):
+    docker_compose_down(docker_compose_path=docker_compose_path)
+
+
 @contextmanager
 def docker_compose_context(docker_compose_path: Path):
     # Dynamically run a specific docker compose file and make sure it gets cleanup by
@@ -30,15 +47,13 @@ def docker_compose_context(docker_compose_path: Path):
         if resp:
             print("STDOUT: {}".format(resp.stdout.decode("utf-8")))
             print("STDERR: {}".format(resp.stderr.decode("utf-8")))
-
-
-        cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v"
-        print(f"Running command: {cmd}")
-        final_resp = subprocess.run(
+        cmd = f"docker compose -f {docker_compose_path.resolve()} logs"
+        logs = subprocess.run(
             cmd,
             shell=True,
             capture_output=True,
         )
-
-
-
+        print("DOCKER LOGS: {}".format(logs.stdout.decode("utf-8")))
+        raise e
+    finally:
+        run_cleanup(docker_compose_path=docker_compose_path)