unstructured-ingest 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/test_duckdb.py +82 -0
- test/integration/connectors/duckdb/test_motherduck.py +106 -0
- test/integration/connectors/test_kafka.py +109 -6
- test/integration/connectors/test_qdrant.py +55 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +1 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +24 -21
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -5
- unstructured_ingest/v2/processes/connectors/confluence.py +14 -2
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +99 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +118 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +133 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +34 -15
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -2
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +2 -3
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +8 -8
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +9 -2
- unstructured_ingest/v2/processes/connectors/kafka/local.py +1 -1
- unstructured_ingest/v2/processes/connectors/kdbai.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +6 -4
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +7 -9
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +62 -24
- unstructured_ingest/v2/processes/connectors/sql/sql.py +8 -3
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +6 -9
- unstructured_ingest/v2/utils.py +9 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/METADATA +19 -17
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/RECORD +39 -31
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/top_level.txt +0 -0
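
The headline addition in 0.3.7 is a DuckDB/MotherDuck destination. Before the per-file diffs, here is a minimal, hedged sketch of how the new pieces fit together, based only on the classes and call signatures exercised in the new test files below; the paths and names are illustrative, and the target database is assumed to already contain the `elements` table that the tests create from duckdb-schema.sql.

```python
from pathlib import Path

from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
    CONNECTOR_TYPE,
    DuckDBConnectionConfig,
    DuckDBUploader,
    DuckDBUploaderConfig,
    DuckDBUploadStager,
)

# Illustrative inputs: a partitioned-elements JSON file and a working directory.
elements_json = Path("example-elements.json")
work_dir = Path("staged")

file_data = FileData(
    source_identifiers=SourceIdentifiers(
        fullpath=elements_json.name, filename=elements_json.name
    ),
    connector_type=CONNECTOR_TYPE,
    identifier="example-file-data",
)

# Stage the elements into the row-oriented form the uploader expects.
stager = DuckDBUploadStager()
staged_path = stager.run(
    elements_filepath=elements_json,
    file_data=file_data,
    output_dir=work_dir,
    output_filename="example",
)

# Write the staged rows into the `elements` table of a local DuckDB file.
uploader = DuckDBUploader(
    connection_config=DuckDBConnectionConfig(database="out.db"),  # assumes schema already exists
    upload_config=DuckDBUploaderConfig(),
)
uploader.run(path=staged_path, file_data=file_data)
```

The MotherDuck variant follows the same staging and upload flow; see the sketch after its test file below.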
test/integration/connectors/duckdb/__init__.py
File without changes (new, empty file).
test/integration/connectors/duckdb/test_duckdb.py
@@ -0,0 +1,82 @@
+import tempfile
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator
+
+import duckdb
+import pandas as pd
+import pytest
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
+    CONNECTOR_TYPE,
+    DuckDBConnectionConfig,
+    DuckDBUploader,
+    DuckDBUploaderConfig,
+    DuckDBUploadStager,
+)
+
+
+@contextmanager
+def duckdbd_setup() -> Generator[Path, None, None]:
+    with tempfile.TemporaryDirectory() as temp_dir:
+        db_path = Path(temp_dir) / "temp_duck.db"
+        db_init_path = Path(__file__).parent / "duckdb-schema.sql"
+        assert db_init_path.exists()
+        assert db_init_path.is_file()
+        with duckdb.connect(database=db_path) as duckdb_connection:
+            with db_init_path.open("r") as f:
+                query = f.read()
+            duckdb_connection.execute(query)
+            duckdb_connection.close()
+        yield db_path
+
+
+def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
+    conn = None
+    try:
+        conn = duckdb.connect(db_path)
+        _results = conn.sql("select count(*) from elements").fetchall()
+        _count = _results[0][0]
+        assert (
+            _count == expected_num_elements
+        ), f"dest check failed: got {_count}, expected {expected_num_elements}"
+        conn.close()
+    finally:
+        if conn:
+            conn.close()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb")
+def test_duckdb_destination(upload_file: Path):
+    with duckdbd_setup() as test_db_path:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            file_data = FileData(
+                source_identifiers=SourceIdentifiers(
+                    fullpath=upload_file.name, filename=upload_file.name
+                ),
+                connector_type=CONNECTOR_TYPE,
+                identifier="mock-file-data",
+            )
+
+            # deafults to default stager config
+            stager = DuckDBUploadStager()
+            stager_params = {
+                "elements_filepath": upload_file,
+                "file_data": file_data,
+                "output_dir": temp_dir,
+                "output_filename": "test_db",
+            }
+            staged_path = stager.run(**stager_params)
+
+            connection_config = DuckDBConnectionConfig(database=str(test_db_path))
+            upload_config = DuckDBUploaderConfig()
+            uploader = DuckDBUploader(
+                connection_config=connection_config, upload_config=upload_config
+            )
+
+            uploader.run(path=staged_path, file_data=file_data)
+
+            staged_df = pd.read_json(staged_path, orient="records", lines=True)
+            validate_duckdb_destination(db_path=test_db_path, expected_num_elements=len(staged_df))
test/integration/connectors/duckdb/test_motherduck.py
@@ -0,0 +1,106 @@
+import os
+import tempfile
+import uuid
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator
+
+import duckdb
+import pandas as pd
+import pytest
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
+    CONNECTOR_TYPE,
+    MotherDuckAccessConfig,
+    MotherDuckConnectionConfig,
+    MotherDuckUploader,
+    MotherDuckUploaderConfig,
+    MotherDuckUploadStager,
+)
+
+
+@contextmanager
+def motherduck_setup(md_token: str) -> Generator[Path, None, None]:
+    database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
+    try:
+        db_init_path = Path(__file__).parent / "duckdb-schema.sql"
+        assert db_init_path.exists()
+        assert db_init_path.is_file()
+        with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
+            with db_init_path.open("r") as f:
+                query = f.read()
+            md_conn.execute(f"CREATE DATABASE {database_name}")
+            md_conn.execute(f"USE {database_name}")
+            md_conn.execute(query)
+            md_conn.close()
+        yield database_name
+    finally:
+        with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
+            md_conn.execute(f"DROP DATABASE {database_name}")
+            md_conn.close()
+
+
+def validate_motherduck_destination(database: str, expected_num_elements: int, md_token: str):
+    conn = None
+    try:
+        conn = duckdb.connect(f"md:?motherduck_token={md_token}")
+        conn.execute(f"USE {database}")
+        _results = conn.sql("select count(*) from elements").fetchall()
+        _count = _results[0][0]
+        assert (
+            _count == expected_num_elements
+        ), f"dest check failed: got {_count}, expected {expected_num_elements}"
+        conn.close()
+    finally:
+        if conn:
+            conn.close()
+
+
+def get_motherduck_token() -> dict:
+    motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
+    assert motherduck_token
+    return motherduck_token
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "motherduck")
+@requires_env("MOTHERDUCK_TOKEN")
+def test_motherduck_destination(upload_file: Path):
+    md_token = get_motherduck_token()
+    with motherduck_setup(md_token) as test_database:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            file_data = FileData(
+                source_identifiers=SourceIdentifiers(
+                    fullpath=upload_file.name, filename=upload_file.name
+                ),
+                connector_type=CONNECTOR_TYPE,
+                identifier="mock-file-data",
+            )
+
+            # deafults to default stager config
+            stager = MotherDuckUploadStager()
+            stager_params = {
+                "elements_filepath": upload_file,
+                "file_data": file_data,
+                "output_dir": temp_dir,
+                "output_filename": "test_db",
+            }
+            staged_path = stager.run(**stager_params)
+
+            access_config = MotherDuckAccessConfig(md_token=md_token)
+            connection_config = MotherDuckConnectionConfig(
+                database=test_database, access_config=access_config
+            )
+            upload_config = MotherDuckUploaderConfig()
+            uploader = MotherDuckUploader(
+                connection_config=connection_config, upload_config=upload_config
+            )
+
+            uploader.run(path=staged_path, file_data=file_data)
+
+            staged_df = pd.read_json(staged_path, orient="records", lines=True)
+            validate_motherduck_destination(
+                database=test_database, expected_num_elements=len(staged_df), md_token=md_token
+            )
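
For the MotherDuck variant, only the connection configuration differs from the DuckDB sketch above: it takes a token-bearing access config and a database name instead of a local file path. A minimal sketch, assuming MOTHERDUCK_TOKEN is set and the named database already has the elements schema (the database name is illustrative):

```python
import os

from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
    MotherDuckAccessConfig,
    MotherDuckConnectionConfig,
    MotherDuckUploader,
    MotherDuckUploaderConfig,
)

uploader = MotherDuckUploader(
    connection_config=MotherDuckConnectionConfig(
        database="my_database",  # illustrative database name
        access_config=MotherDuckAccessConfig(md_token=os.environ["MOTHERDUCK_TOKEN"]),
    ),
    upload_config=MotherDuckUploaderConfig(),
)
# uploader.run(path=staged_path, file_data=file_data) then writes the staged rows,
# exactly as in the DuckDB sketch above (MotherDuckUploadStager stages them the same way).
```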
test/integration/connectors/test_kafka.py
@@ -1,4 +1,5 @@
 import json
+import os
 import tempfile
 import time
 from pathlib import Path
@@ -17,8 +18,17 @@ from test.integration.connectors.utils.validation import (
     ValidationConfigs,
     source_connector_validation,
 )
+from test.integration.utils import requires_env
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.kafka.cloud import (
+    CloudKafkaAccessConfig,
+    CloudKafkaConnectionConfig,
+    CloudKafkaDownloader,
+    CloudKafkaDownloaderConfig,
+    CloudKafkaIndexer,
+    CloudKafkaIndexerConfig,
+)
 from unstructured_ingest.v2.processes.connectors.kafka.local import (
     CONNECTOR_TYPE,
     LocalKafkaConnectionConfig,
@@ -47,20 +57,27 @@ def docker_compose_ctx():
         yield ctx


-def wait_for_topic(
-
+def wait_for_topic(
+    topic: str,
+    retries: int = 10,
+    interval: int = 1,
+    exists: bool = True,
+    admin_client=None,
+):
+    if admin_client is None:
+        admin_client = get_admin_client()
     current_topics = admin_client.list_topics().topics
     attempts = 0
-    while topic not in current_topics and attempts < retries:
+    while (topic not in current_topics) == exists and attempts < retries:
         attempts += 1
         print(
-            "Attempt {}: Waiting for topic {} to exist
-                attempts, topic, ", ".join(current_topics)
+            "Attempt {}: Waiting for topic {} to {} exist. Current topics: [{}]".format(
+                attempts, topic, "" if exists else "not", ", ".join(current_topics)
             )
         )
         time.sleep(interval)
         current_topics = admin_client.list_topics().topics
-    if topic not in current_topics:
+    if (topic not in current_topics) == exists:
         raise TimeoutError(f"Timeout out waiting for topic {topic} to exist")


@@ -110,6 +127,92 @@ async def test_kafka_source_local(kafka_seed_topic: str):
         )


+@pytest.fixture
+def kafka_seed_topic_cloud(expected_messages: int = 5) -> int:
+    conf = {
+        "bootstrap.servers": os.environ["KAFKA_BOOTSTRAP_SERVER"],
+        "sasl.username": os.environ["KAFKA_API_KEY"],
+        "sasl.password": os.environ["KAFKA_SECRET"],
+        "sasl.mechanism": "PLAIN",
+        "security.protocol": "SASL_SSL",
+    }
+    admin_client = AdminClient(conf)
+    try:
+        res = admin_client.delete_topics([TOPIC], operation_timeout=10)
+        for topic, f in res.items():
+            f.result()
+            print(f"Topic {topic} removed")
+            wait_for_topic(TOPIC, 5, 1, False, admin_client)
+    except Exception:
+        pass
+
+    cluster_meta = admin_client.list_topics()
+    current_topics = [topic for topic in cluster_meta.topics if topic != "__consumer_offsets"]
+
+    assert TOPIC not in current_topics, f"Topic {TOPIC} shouldn't exist"
+
+    # Kafka Cloud allows to use replication_factor=1 only for Dedicated clusters.
+    topic_obj = NewTopic(TOPIC, num_partitions=1, replication_factor=3)
+
+    res = admin_client.create_topics([topic_obj], operation_timeout=10, validate_only=False)
+    for topic, f in res.items():
+        f.result()
+
+    producer = Producer(conf)
+    for i in range(expected_messages):
+        message = f"This is some text for message {i}"
+        producer.produce(topic=TOPIC, value=message)
+    producer.flush(timeout=10)
+    return expected_messages
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("KAFKA_API_KEY", "KAFKA_SECRET", "KAFKA_BOOTSTRAP_SERVER")
+async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
+    """
+    In order to have this test succeed, you need to create cluster on Confluent Cloud,
+    and create the API key with admin privileges. By default, user account keys have it.
+    """
+
+    expected_messages = kafka_seed_topic_cloud
+
+    connection_config = CloudKafkaConnectionConfig(
+        bootstrap_server=os.environ["KAFKA_BOOTSTRAP_SERVER"],
+        port=9092,
+        access_config=CloudKafkaAccessConfig(
+            kafka_api_key=os.environ["KAFKA_API_KEY"],
+            secret=os.environ["KAFKA_SECRET"],
+        ),
+    )
+
+    with tempfile.TemporaryDirectory() as tempdir:
+        tempdir_path = Path(tempdir)
+        download_config = CloudKafkaDownloaderConfig(download_dir=tempdir_path)
+        indexer = CloudKafkaIndexer(
+            connection_config=connection_config,
+            index_config=CloudKafkaIndexerConfig(
+                topic=TOPIC,
+                num_messages_to_consume=expected_messages,
+            ),
+        )
+        downloader = CloudKafkaDownloader(
+            connection_config=connection_config, download_config=download_config
+        )
+        indexer.precheck()
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=ValidationConfigs(
+                test_id="kafka",
+                exclude_fields_extend=["connector_type"],
+                expected_num_files=expected_messages,
+                validate_downloaded_files=True,
+                validate_file_data=True,
+            ),
+        )
+
+
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
 def test_kafka_source_local_precheck_fail_no_cluster():
     connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
test/integration/connectors/test_qdrant.py
@@ -1,4 +1,5 @@
 import json
+import os
 import uuid
 from contextlib import asynccontextmanager
 from pathlib import Path
@@ -9,7 +10,16 @@ from qdrant_client import AsyncQdrantClient

 from test.integration.connectors.utils.constants import DESTINATION_TAG
 from test.integration.connectors.utils.docker import container_context
+from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.qdrant.cloud import (
+    CloudQdrantAccessConfig,
+    CloudQdrantConnectionConfig,
+    CloudQdrantUploader,
+    CloudQdrantUploaderConfig,
+    CloudQdrantUploadStager,
+    CloudQdrantUploadStagerConfig,
+)
 from unstructured_ingest.v2.processes.connectors.qdrant.local import (
     CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE,
 )
@@ -135,3 +145,48 @@ async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, dock
     uploader.run(path=upload_file, file_data=file_data)
     async with qdrant_client(connection_kwargs) as client:
         await validate_upload(client=client, upload_file=upload_file)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+@requires_env("QDRANT_API_KEY", "QDRANT_SERVER_URL")
+async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
+    server_url = os.environ["QDRANT_SERVER_URL"]
+    api_key = os.environ["QDRANT_API_KEY"]
+    connection_kwargs = {"location": server_url, "api_key": api_key}
+    async with qdrant_client(connection_kwargs) as client:
+        await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+    AsyncQdrantClient(**connection_kwargs)
+
+    stager = CloudQdrantUploadStager(
+        upload_stager_config=CloudQdrantUploadStagerConfig(),
+    )
+    uploader = CloudQdrantUploader(
+        connection_config=CloudQdrantConnectionConfig(
+            url=server_url,
+            access_config=CloudQdrantAccessConfig(
+                api_key=api_key,
+            ),
+        ),
+        upload_config=CloudQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+    )
+
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=SERVER_CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    staged_upload_file = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+
+    if uploader.is_async():
+        await uploader.run_async(path=staged_upload_file, file_data=file_data)
+    else:
+        uploader.run(path=staged_upload_file, file_data=file_data)
+    async with qdrant_client(connection_kwargs) as client:
+        await validate_upload(client=client, upload_file=upload_file)
test/unit/v2/connectors/test_confluence.py
@@ -0,0 +1,39 @@
+import pytest
+from pydantic import ValidationError
+
+from unstructured_ingest.v2.processes.connectors.confluence import (
+    ConfluenceAccessConfig,
+    ConfluenceConnectionConfig,
+)
+
+
+def test_connection_config_multiple_auth():
+    with pytest.raises(ValidationError):
+        ConfluenceConnectionConfig(
+            access_config=ConfluenceAccessConfig(
+                api_token="api_token",
+                access_token="access_token",
+            ),
+            user_email="user_email",
+            url="url",
+        )
+
+
+def test_connection_config_no_auth():
+    with pytest.raises(ValidationError):
+        ConfluenceConnectionConfig(access_config=ConfluenceAccessConfig(), url="url")
+
+
+def test_connection_config_basic_auth():
+    ConfluenceConnectionConfig(
+        access_config=ConfluenceAccessConfig(api_token="api_token"),
+        url="url",
+        user_email="user_email",
+    )
+
+
+def test_connection_config_pat_auth():
+    ConfluenceConnectionConfig(
+        access_config=ConfluenceAccessConfig(access_token="access_token"),
+        url="url",
+    )
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.
+__version__ = "0.3.7" # pragma: no cover
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import unstructured_ingest.v2.processes.connectors.databricks # noqa: F401
+import unstructured_ingest.v2.processes.connectors.duckdb # noqa: F401
 import unstructured_ingest.v2.processes.connectors.elasticsearch # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec # noqa: F401
 import unstructured_ingest.v2.processes.connectors.kafka # noqa: F401
unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -1,5 +1,4 @@
 import json
-import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
+from unstructured_ingest.v2.utils import get_enhanced_element_id

 if TYPE_CHECKING:
     from azure.search.documents import SearchClient
@@ -100,7 +100,7 @@ class AzureAISearchUploadStager(UploadStager):
         Azure Cognitive Search index
         """

-        data["id"] =
+        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
         data[RECORD_ID_LABEL] = file_data.identifier

         if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
@@ -173,8 +173,10 @@ class AzureAISearchUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE

     def query_docs(self, record_id: str, index_key: str) -> list[str]:
-
-
+        with self.connection_config.get_search_client() as search_client:
+            results = list(
+                search_client.search(filter=f"record_id eq '{record_id}'", select=[index_key])
+            )
         return [result[index_key] for result in results]

     def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
@@ -186,10 +188,10 @@ class AzureAISearchUploader(Uploader):
         doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
         if not doc_ids_to_delete:
             return
-
-
-
-
+        with self.connection_config.get_search_client() as search_client:
+            results = search_client.delete_documents(
+                documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
+            )
         errors = []
         success = []
         for result in results:
@@ -207,7 +209,9 @@ class AzureAISearchUploader(Uploader):

     @DestinationConnectionError.wrap
     @requires_dependencies(["azure"], extras="azure-ai-search")
-    def write_dict(
+    def write_dict(
+        self, elements_dict: list[dict[str, Any]], search_client: "SearchClient"
+    ) -> None:
         import azure.core.exceptions

         logger.info(
@@ -215,12 +219,10 @@ class AzureAISearchUploader(Uploader):
             f"index at {self.connection_config.index}",
         )
         try:
-            results =
-            documents=elements_dict
-            )
-
+            results = search_client.upload_documents(documents=elements_dict)
         except azure.core.exceptions.HttpResponseError as http_error:
             raise WriteError(f"http error: {http_error}") from http_error
+
         errors = []
         success = []
         for result in results:
@@ -240,8 +242,8 @@ class AzureAISearchUploader(Uploader):
         )

     def can_delete(self) -> bool:
-
-
+        with self.connection_config.get_search_index_client() as search_index_client:
+            index = search_index_client.get_index(name=self.connection_config.index)
         index_fields = index.fields
         record_id_fields = [
             field for field in index_fields if field.name == self.upload_config.record_id_key
@@ -252,8 +254,8 @@ class AzureAISearchUploader(Uploader):
         return record_id_field.filterable

     def get_index_key(self) -> str:
-
-
+        with self.connection_config.get_search_index_client() as search_index_client:
+            index = search_index_client.get_index(name=self.connection_config.index)
         index_fields = index.fields
         key_fields = [field for field in index_fields if field.key]
         if not key_fields:
@@ -262,8 +264,8 @@ class AzureAISearchUploader(Uploader):

     def precheck(self) -> None:
         try:
-
-
+            with self.connection_config.get_search_client() as search_client:
+                search_client.get_document_count()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -284,8 +286,9 @@ class AzureAISearchUploader(Uploader):
             logger.warning("criteria for deleting previous content not met, skipping")

         batch_size = self.upload_config.batch_size
-
-
+        with self.connection_config.get_search_client() as search_client:
+            for chunk in batch_generator(elements_dict, batch_size):
+                self.write_dict(elements_dict=chunk, search_client=search_client) # noqa: E203


 azure_ai_search_destination_entry = DestinationRegistryEntry(
unstructured_ingest/v2/processes/connectors/chroma.py
@@ -1,5 +1,4 @@
 import json
-import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.utils import get_enhanced_element_id

 from .utils import conform_string_to_dict

@@ -83,13 +83,12 @@ class ChromaUploadStager(UploadStager):
         return parser.parse(date_string)

     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
         """
         Prepares dictionary in the format that Chroma requires
         """
-        element_id = data.get("element_id", str(uuid.uuid4()))
         return {
-            "id":
+            "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
             "embedding": data.pop("embeddings", None),
             "document": data.pop("text", None),
             "metadata": flatten_dict(data, separator="-", flatten_lists=True, remove_none=True),
@@ -105,7 +104,9 @@ class ChromaUploadStager(UploadStager):
     ) -> Path:
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
-        conformed_elements = [
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
         with open(output_path, "w") as output_file:
             json.dump(conformed_elements, output_file)
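
Both the Azure AI Search and Chroma stagers above now take their record IDs from get_enhanced_element_id, which is added in unstructured_ingest/v2/utils.py (+9 lines) but whose body is not shown in this diff. A hypothetical sketch of the idea, inferred only from the call sites: a deterministic ID derived from the element ID and the file identifier, so re-ingesting the same document updates existing records instead of creating duplicates with random UUIDs.

```python
# Hypothetical sketch only; the real helper lives in unstructured_ingest/v2/utils.py
# and may differ in detail.
import uuid


def get_enhanced_element_id(element_dict: dict, file_data) -> str:
    element_id = element_dict.get("element_id")
    # Deterministic UUID from element id + file identifier (assumed scheme).
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{element_id}{file_data.identifier}"))
```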
unstructured_ingest/v2/processes/connectors/confluence.py
@@ -30,16 +30,28 @@ CONNECTOR_TYPE = "confluence"


 class ConfluenceAccessConfig(AccessConfig):
-    api_token: str = Field(description="Confluence API token")
+    api_token: Optional[str] = Field(description="Confluence API token", default=None)
+    access_token: Optional[str] = Field(
+        description="Confluence Personal Access Token", default=None
+    )


 class ConfluenceConnectionConfig(ConnectionConfig):
     url: str = Field(description="URL of the Confluence instance")
-    user_email: str = Field(description="User email for authentication")
+    user_email: Optional[str] = Field(description="User email for authentication", default=None)
     access_config: Secret[ConfluenceAccessConfig] = Field(
         description="Access configuration for Confluence"
     )

+    def model_post_init(self, __context):
+        access_configs = self.access_config.get_secret_value()
+        basic_auth = self.user_email and access_configs.api_token
+        pat_auth = access_configs.access_token
+        if basic_auth and pat_auth:
+            raise ValueError("both forms of auth provided, only one allowed")
+        if not (basic_auth or pat_auth):
+            raise ValueError("neither forms of auth provided")
+
     @requires_dependencies(["atlassian"], extras="confluence")
     def get_client(self) -> "Confluence":
         from atlassian import Confluence
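
In practical terms, the new model_post_init check means a Confluence connection config must use exactly one of the two auth forms, mirroring the unit tests added above. A short sketch with illustrative values (URLs and tokens are placeholders):

```python
from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
)

# Basic auth: user_email + api_token.
basic = ConfluenceConnectionConfig(
    url="https://example.atlassian.net/wiki",
    user_email="user@example.com",
    access_config=ConfluenceAccessConfig(api_token="api-token"),
)

# Personal Access Token auth: access_token only.
pat = ConfluenceConnectionConfig(
    url="https://confluence.example.com",
    access_config=ConfluenceAccessConfig(access_token="personal-access-token"),
)

# Supplying both forms, or neither, now fails validation.
```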
unstructured_ingest/v2/processes/connectors/duckdb/__init__.py
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+)
+
+from .duckdb import CONNECTOR_TYPE as DUCKDB_CONNECTOR_TYPE
+from .duckdb import duckdb_destination_entry
+from .motherduck import CONNECTOR_TYPE as MOTHERDUCK_CONNECTOR_TYPE
+from .motherduck import motherduck_destination_entry
+
+add_destination_entry(destination_type=DUCKDB_CONNECTOR_TYPE, entry=duckdb_destination_entry)
+add_destination_entry(
+    destination_type=MOTHERDUCK_CONNECTOR_TYPE, entry=motherduck_destination_entry
+)