unstructured-ingest 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This version of unstructured-ingest has been flagged as a potentially problematic release.

Files changed (39)
  1. test/integration/connectors/duckdb/__init__.py +0 -0
  2. test/integration/connectors/duckdb/test_duckdb.py +82 -0
  3. test/integration/connectors/duckdb/test_motherduck.py +106 -0
  4. test/integration/connectors/test_kafka.py +109 -6
  5. test/integration/connectors/test_qdrant.py +55 -0
  6. test/unit/v2/connectors/test_confluence.py +39 -0
  7. unstructured_ingest/__version__.py +1 -1
  8. unstructured_ingest/v2/processes/connectors/__init__.py +1 -0
  9. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +24 -21
  10. unstructured_ingest/v2/processes/connectors/chroma.py +6 -5
  11. unstructured_ingest/v2/processes/connectors/confluence.py +14 -2
  12. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  13. unstructured_ingest/v2/processes/connectors/duckdb/base.py +99 -0
  14. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +118 -0
  15. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +133 -0
  16. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +34 -15
  17. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -2
  18. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -11
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +3 -3
  20. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +2 -2
  21. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +2 -3
  22. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +8 -8
  23. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +9 -2
  24. unstructured_ingest/v2/processes/connectors/kafka/local.py +1 -1
  25. unstructured_ingest/v2/processes/connectors/kdbai.py +2 -2
  26. unstructured_ingest/v2/processes/connectors/pinecone.py +21 -33
  27. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +6 -4
  28. unstructured_ingest/v2/processes/connectors/sql/__init__.py +2 -1
  29. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +7 -9
  30. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +62 -24
  31. unstructured_ingest/v2/processes/connectors/sql/sql.py +8 -3
  32. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +6 -9
  33. unstructured_ingest/v2/utils.py +9 -0
  34. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.8.dist-info}/METADATA +18 -16
  35. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.8.dist-info}/RECORD +39 -31
  36. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.8.dist-info}/LICENSE.md +0 -0
  37. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.8.dist-info}/WHEEL +0 -0
  38. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.8.dist-info}/entry_points.txt +0 -0
  39. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.8.dist-info}/top_level.txt +0 -0
test/integration/connectors/duckdb/__init__.py (file without changes)

test/integration/connectors/duckdb/test_duckdb.py
@@ -0,0 +1,82 @@
+ import tempfile
+ from contextlib import contextmanager
+ from pathlib import Path
+ from typing import Generator
+
+ import duckdb
+ import pandas as pd
+ import pytest
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
+ from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
+     CONNECTOR_TYPE,
+     DuckDBConnectionConfig,
+     DuckDBUploader,
+     DuckDBUploaderConfig,
+     DuckDBUploadStager,
+ )
+
+
+ @contextmanager
+ def duckdbd_setup() -> Generator[Path, None, None]:
+     with tempfile.TemporaryDirectory() as temp_dir:
+         db_path = Path(temp_dir) / "temp_duck.db"
+         db_init_path = Path(__file__).parent / "duckdb-schema.sql"
+         assert db_init_path.exists()
+         assert db_init_path.is_file()
+         with duckdb.connect(database=db_path) as duckdb_connection:
+             with db_init_path.open("r") as f:
+                 query = f.read()
+             duckdb_connection.execute(query)
+             duckdb_connection.close()
+         yield db_path
+
+
+ def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
+     conn = None
+     try:
+         conn = duckdb.connect(db_path)
+         _results = conn.sql("select count(*) from elements").fetchall()
+         _count = _results[0][0]
+         assert (
+             _count == expected_num_elements
+         ), f"dest check failed: got {_count}, expected {expected_num_elements}"
+         conn.close()
+     finally:
+         if conn:
+             conn.close()
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb")
+ def test_duckdb_destination(upload_file: Path):
+     with duckdbd_setup() as test_db_path:
+         with tempfile.TemporaryDirectory() as temp_dir:
+             file_data = FileData(
+                 source_identifiers=SourceIdentifiers(
+                     fullpath=upload_file.name, filename=upload_file.name
+                 ),
+                 connector_type=CONNECTOR_TYPE,
+                 identifier="mock-file-data",
+             )
+
+             # deafults to default stager config
+             stager = DuckDBUploadStager()
+             stager_params = {
+                 "elements_filepath": upload_file,
+                 "file_data": file_data,
+                 "output_dir": temp_dir,
+                 "output_filename": "test_db",
+             }
+             staged_path = stager.run(**stager_params)
+
+             connection_config = DuckDBConnectionConfig(database=str(test_db_path))
+             upload_config = DuckDBUploaderConfig()
+             uploader = DuckDBUploader(
+                 connection_config=connection_config, upload_config=upload_config
+             )
+
+             uploader.run(path=staged_path, file_data=file_data)
+
+             staged_df = pd.read_json(staged_path, orient="records", lines=True)
+             validate_duckdb_destination(db_path=test_db_path, expected_num_elements=len(staged_df))
test/integration/connectors/duckdb/test_motherduck.py
@@ -0,0 +1,106 @@
+ import os
+ import tempfile
+ import uuid
+ from contextlib import contextmanager
+ from pathlib import Path
+ from typing import Generator
+
+ import duckdb
+ import pandas as pd
+ import pytest
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
+     CONNECTOR_TYPE,
+     MotherDuckAccessConfig,
+     MotherDuckConnectionConfig,
+     MotherDuckUploader,
+     MotherDuckUploaderConfig,
+     MotherDuckUploadStager,
+ )
+
+
+ @contextmanager
+ def motherduck_setup(md_token: str) -> Generator[Path, None, None]:
+     database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
+     try:
+         db_init_path = Path(__file__).parent / "duckdb-schema.sql"
+         assert db_init_path.exists()
+         assert db_init_path.is_file()
+         with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
+             with db_init_path.open("r") as f:
+                 query = f.read()
+             md_conn.execute(f"CREATE DATABASE {database_name}")
+             md_conn.execute(f"USE {database_name}")
+             md_conn.execute(query)
+             md_conn.close()
+         yield database_name
+     finally:
+         with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
+             md_conn.execute(f"DROP DATABASE {database_name}")
+             md_conn.close()
+
+
+ def validate_motherduck_destination(database: str, expected_num_elements: int, md_token: str):
+     conn = None
+     try:
+         conn = duckdb.connect(f"md:?motherduck_token={md_token}")
+         conn.execute(f"USE {database}")
+         _results = conn.sql("select count(*) from elements").fetchall()
+         _count = _results[0][0]
+         assert (
+             _count == expected_num_elements
+         ), f"dest check failed: got {_count}, expected {expected_num_elements}"
+         conn.close()
+     finally:
+         if conn:
+             conn.close()
+
+
+ def get_motherduck_token() -> dict:
+     motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
+     assert motherduck_token
+     return motherduck_token
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "motherduck")
+ @requires_env("MOTHERDUCK_TOKEN")
+ def test_motherduck_destination(upload_file: Path):
+     md_token = get_motherduck_token()
+     with motherduck_setup(md_token) as test_database:
+         with tempfile.TemporaryDirectory() as temp_dir:
+             file_data = FileData(
+                 source_identifiers=SourceIdentifiers(
+                     fullpath=upload_file.name, filename=upload_file.name
+                 ),
+                 connector_type=CONNECTOR_TYPE,
+                 identifier="mock-file-data",
+             )
+
+             # deafults to default stager config
+             stager = MotherDuckUploadStager()
+             stager_params = {
+                 "elements_filepath": upload_file,
+                 "file_data": file_data,
+                 "output_dir": temp_dir,
+                 "output_filename": "test_db",
+             }
+             staged_path = stager.run(**stager_params)
+
+             access_config = MotherDuckAccessConfig(md_token=md_token)
+             connection_config = MotherDuckConnectionConfig(
+                 database=test_database, access_config=access_config
+             )
+             upload_config = MotherDuckUploaderConfig()
+             uploader = MotherDuckUploader(
+                 connection_config=connection_config, upload_config=upload_config
+             )
+
+             uploader.run(path=staged_path, file_data=file_data)
+
+             staged_df = pd.read_json(staged_path, orient="records", lines=True)
+             validate_motherduck_destination(
+                 database=test_database, expected_num_elements=len(staged_df), md_token=md_token
+             )
test/integration/connectors/test_kafka.py
@@ -1,4 +1,5 @@
  import json
+ import os
  import tempfile
  import time
  from pathlib import Path
@@ -17,8 +18,17 @@ from test.integration.connectors.utils.validation import (
      ValidationConfigs,
      source_connector_validation,
  )
+ from test.integration.utils import requires_env
  from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.kafka.cloud import (
+     CloudKafkaAccessConfig,
+     CloudKafkaConnectionConfig,
+     CloudKafkaDownloader,
+     CloudKafkaDownloaderConfig,
+     CloudKafkaIndexer,
+     CloudKafkaIndexerConfig,
+ )
  from unstructured_ingest.v2.processes.connectors.kafka.local import (
      CONNECTOR_TYPE,
      LocalKafkaConnectionConfig,
@@ -47,20 +57,27 @@ def docker_compose_ctx():
      yield ctx


- def wait_for_topic(topic: str, retries: int = 10, interval: int = 1):
-     admin_client = get_admin_client()
+ def wait_for_topic(
+     topic: str,
+     retries: int = 10,
+     interval: int = 1,
+     exists: bool = True,
+     admin_client=None,
+ ):
+     if admin_client is None:
+         admin_client = get_admin_client()
      current_topics = admin_client.list_topics().topics
      attempts = 0
-     while topic not in current_topics and attempts < retries:
+     while (topic not in current_topics) == exists and attempts < retries:
          attempts += 1
          print(
-             "Attempt {}: Waiting for topic {} to exist in {}".format(
-                 attempts, topic, ", ".join(current_topics)
+             "Attempt {}: Waiting for topic {} to {} exist. Current topics: [{}]".format(
+                 attempts, topic, "" if exists else "not", ", ".join(current_topics)
              )
          )
          time.sleep(interval)
          current_topics = admin_client.list_topics().topics
-     if topic not in current_topics:
+     if (topic not in current_topics) == exists:
          raise TimeoutError(f"Timeout out waiting for topic {topic} to exist")


@@ -110,6 +127,92 @@ async def test_kafka_source_local(kafka_seed_topic: str):
      )


+ @pytest.fixture
+ def kafka_seed_topic_cloud(expected_messages: int = 5) -> int:
+     conf = {
+         "bootstrap.servers": os.environ["KAFKA_BOOTSTRAP_SERVER"],
+         "sasl.username": os.environ["KAFKA_API_KEY"],
+         "sasl.password": os.environ["KAFKA_SECRET"],
+         "sasl.mechanism": "PLAIN",
+         "security.protocol": "SASL_SSL",
+     }
+     admin_client = AdminClient(conf)
+     try:
+         res = admin_client.delete_topics([TOPIC], operation_timeout=10)
+         for topic, f in res.items():
+             f.result()
+             print(f"Topic {topic} removed")
+             wait_for_topic(TOPIC, 5, 1, False, admin_client)
+     except Exception:
+         pass
+
+     cluster_meta = admin_client.list_topics()
+     current_topics = [topic for topic in cluster_meta.topics if topic != "__consumer_offsets"]
+
+     assert TOPIC not in current_topics, f"Topic {TOPIC} shouldn't exist"
+
+     # Kafka Cloud allows to use replication_factor=1 only for Dedicated clusters.
+     topic_obj = NewTopic(TOPIC, num_partitions=1, replication_factor=3)
+
+     res = admin_client.create_topics([topic_obj], operation_timeout=10, validate_only=False)
+     for topic, f in res.items():
+         f.result()
+
+     producer = Producer(conf)
+     for i in range(expected_messages):
+         message = f"This is some text for message {i}"
+         producer.produce(topic=TOPIC, value=message)
+     producer.flush(timeout=10)
+     return expected_messages
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env("KAFKA_API_KEY", "KAFKA_SECRET", "KAFKA_BOOTSTRAP_SERVER")
+ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
+     """
+     In order to have this test succeed, you need to create cluster on Confluent Cloud,
+     and create the API key with admin privileges. By default, user account keys have it.
+     """
+
+     expected_messages = kafka_seed_topic_cloud
+
+     connection_config = CloudKafkaConnectionConfig(
+         bootstrap_server=os.environ["KAFKA_BOOTSTRAP_SERVER"],
+         port=9092,
+         access_config=CloudKafkaAccessConfig(
+             kafka_api_key=os.environ["KAFKA_API_KEY"],
+             secret=os.environ["KAFKA_SECRET"],
+         ),
+     )
+
+     with tempfile.TemporaryDirectory() as tempdir:
+         tempdir_path = Path(tempdir)
+         download_config = CloudKafkaDownloaderConfig(download_dir=tempdir_path)
+         indexer = CloudKafkaIndexer(
+             connection_config=connection_config,
+             index_config=CloudKafkaIndexerConfig(
+                 topic=TOPIC,
+                 num_messages_to_consume=expected_messages,
+             ),
+         )
+         downloader = CloudKafkaDownloader(
+             connection_config=connection_config, download_config=download_config
+         )
+         indexer.precheck()
+         await source_connector_validation(
+             indexer=indexer,
+             downloader=downloader,
+             configs=ValidationConfigs(
+                 test_id="kafka",
+                 exclude_fields_extend=["connector_type"],
+                 expected_num_files=expected_messages,
+                 validate_downloaded_files=True,
+                 validate_file_data=True,
+             ),
+         )
+
+
  @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
  def test_kafka_source_local_precheck_fail_no_cluster():
      connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
test/integration/connectors/test_qdrant.py
@@ -1,4 +1,5 @@
  import json
+ import os
  import uuid
  from contextlib import asynccontextmanager
  from pathlib import Path
@@ -9,7 +10,16 @@ from qdrant_client import AsyncQdrantClient

  from test.integration.connectors.utils.constants import DESTINATION_TAG
  from test.integration.connectors.utils.docker import container_context
+ from test.integration.utils import requires_env
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.qdrant.cloud import (
+     CloudQdrantAccessConfig,
+     CloudQdrantConnectionConfig,
+     CloudQdrantUploader,
+     CloudQdrantUploaderConfig,
+     CloudQdrantUploadStager,
+     CloudQdrantUploadStagerConfig,
+ )
  from unstructured_ingest.v2.processes.connectors.qdrant.local import (
      CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE,
  )
@@ -135,3 +145,48 @@ async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, dock
      uploader.run(path=upload_file, file_data=file_data)
      async with qdrant_client(connection_kwargs) as client:
          await validate_upload(client=client, upload_file=upload_file)
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+ @requires_env("QDRANT_API_KEY", "QDRANT_SERVER_URL")
+ async def test_qdrant_destination_cloud(upload_file: Path, tmp_path: Path):
+     server_url = os.environ["QDRANT_SERVER_URL"]
+     api_key = os.environ["QDRANT_API_KEY"]
+     connection_kwargs = {"location": server_url, "api_key": api_key}
+     async with qdrant_client(connection_kwargs) as client:
+         await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+     AsyncQdrantClient(**connection_kwargs)
+
+     stager = CloudQdrantUploadStager(
+         upload_stager_config=CloudQdrantUploadStagerConfig(),
+     )
+     uploader = CloudQdrantUploader(
+         connection_config=CloudQdrantConnectionConfig(
+             url=server_url,
+             access_config=CloudQdrantAccessConfig(
+                 api_key=api_key,
+             ),
+         ),
+         upload_config=CloudQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+     )
+
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=SERVER_CONNECTOR_TYPE,
+         identifier="mock-file-data",
+     )
+
+     staged_upload_file = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+
+     if uploader.is_async():
+         await uploader.run_async(path=staged_upload_file, file_data=file_data)
+     else:
+         uploader.run(path=staged_upload_file, file_data=file_data)
+     async with qdrant_client(connection_kwargs) as client:
+         await validate_upload(client=client, upload_file=upload_file)
test/unit/v2/connectors/test_confluence.py
@@ -0,0 +1,39 @@
+ import pytest
+ from pydantic import ValidationError
+
+ from unstructured_ingest.v2.processes.connectors.confluence import (
+     ConfluenceAccessConfig,
+     ConfluenceConnectionConfig,
+ )
+
+
+ def test_connection_config_multiple_auth():
+     with pytest.raises(ValidationError):
+         ConfluenceConnectionConfig(
+             access_config=ConfluenceAccessConfig(
+                 api_token="api_token",
+                 access_token="access_token",
+             ),
+             user_email="user_email",
+             url="url",
+         )
+
+
+ def test_connection_config_no_auth():
+     with pytest.raises(ValidationError):
+         ConfluenceConnectionConfig(access_config=ConfluenceAccessConfig(), url="url")
+
+
+ def test_connection_config_basic_auth():
+     ConfluenceConnectionConfig(
+         access_config=ConfluenceAccessConfig(api_token="api_token"),
+         url="url",
+         user_email="user_email",
+     )
+
+
+ def test_connection_config_pat_auth():
+     ConfluenceConnectionConfig(
+         access_config=ConfluenceAccessConfig(access_token="access_token"),
+         url="url",
+     )
unstructured_ingest/__version__.py
@@ -1 +1 @@
- __version__ = "0.3.6" # pragma: no cover
+ __version__ = "0.3.8" # pragma: no cover
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -1,6 +1,7 @@
  from __future__ import annotations

  import unstructured_ingest.v2.processes.connectors.databricks # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.duckdb # noqa: F401
  import unstructured_ingest.v2.processes.connectors.elasticsearch # noqa: F401
  import unstructured_ingest.v2.processes.connectors.fsspec # noqa: F401
  import unstructured_ingest.v2.processes.connectors.kafka # noqa: F401
unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -1,5 +1,4 @@
  import json
- import uuid
  from dataclasses import dataclass, field
  from pathlib import Path
  from typing import TYPE_CHECKING, Any
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
  )
  from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
+ from unstructured_ingest.v2.utils import get_enhanced_element_id

  if TYPE_CHECKING:
      from azure.search.documents import SearchClient
@@ -100,7 +100,7 @@ class AzureAISearchUploadStager(UploadStager):
          Azure Cognitive Search index
          """

-         data["id"] = str(uuid.uuid4())
+         data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
          data[RECORD_ID_LABEL] = file_data.identifier

          if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
@@ -173,8 +173,10 @@ class AzureAISearchUploader(Uploader):
      connector_type: str = CONNECTOR_TYPE

      def query_docs(self, record_id: str, index_key: str) -> list[str]:
-         client = self.connection_config.get_search_client()
-         results = list(client.search(filter=f"record_id eq '{record_id}'", select=[index_key]))
+         with self.connection_config.get_search_client() as search_client:
+             results = list(
+                 search_client.search(filter=f"record_id eq '{record_id}'", select=[index_key])
+             )
          return [result[index_key] for result in results]

      def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
@@ -186,10 +188,10 @@ class AzureAISearchUploader(Uploader):
          doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
          if not doc_ids_to_delete:
              return
-         client: SearchClient = self.connection_config.get_search_client()
-         results = client.delete_documents(
-             documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
-         )
+         with self.connection_config.get_search_client() as search_client:
+             results = search_client.delete_documents(
+                 documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
+             )
          errors = []
          success = []
          for result in results:
@@ -207,7 +209,9 @@ class AzureAISearchUploader(Uploader):

      @DestinationConnectionError.wrap
      @requires_dependencies(["azure"], extras="azure-ai-search")
-     def write_dict(self, elements_dict: list[dict[str, Any]]) -> None:
+     def write_dict(
+         self, elements_dict: list[dict[str, Any]], search_client: "SearchClient"
+     ) -> None:
          import azure.core.exceptions

          logger.info(
@@ -215,12 +219,10 @@ class AzureAISearchUploader(Uploader):
              f"index at {self.connection_config.index}",
          )
          try:
-             results = self.connection_config.get_search_client().upload_documents(
-                 documents=elements_dict
-             )
-
+             results = search_client.upload_documents(documents=elements_dict)
          except azure.core.exceptions.HttpResponseError as http_error:
              raise WriteError(f"http error: {http_error}") from http_error
+
          errors = []
          success = []
          for result in results:
@@ -240,8 +242,8 @@ class AzureAISearchUploader(Uploader):
          )

      def can_delete(self) -> bool:
-         search_index_client = self.connection_config.get_search_index_client()
-         index = search_index_client.get_index(name=self.connection_config.index)
+         with self.connection_config.get_search_index_client() as search_index_client:
+             index = search_index_client.get_index(name=self.connection_config.index)
          index_fields = index.fields
          record_id_fields = [
              field for field in index_fields if field.name == self.upload_config.record_id_key
@@ -252,8 +254,8 @@ class AzureAISearchUploader(Uploader):
          return record_id_field.filterable

      def get_index_key(self) -> str:
-         search_index_client = self.connection_config.get_search_index_client()
-         index = search_index_client.get_index(name=self.connection_config.index)
+         with self.connection_config.get_search_index_client() as search_index_client:
+             index = search_index_client.get_index(name=self.connection_config.index)
          index_fields = index.fields
          key_fields = [field for field in index_fields if field.key]
          if not key_fields:
@@ -262,8 +264,8 @@ class AzureAISearchUploader(Uploader):

      def precheck(self) -> None:
          try:
-             client = self.connection_config.get_search_client()
-             client.get_document_count()
+             with self.connection_config.get_search_client() as search_client:
+                 search_client.get_document_count()
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -284,8 +286,9 @@ class AzureAISearchUploader(Uploader):
              logger.warning("criteria for deleting previous content not met, skipping")

          batch_size = self.upload_config.batch_size
-         for chunk in batch_generator(elements_dict, batch_size):
-             self.write_dict(elements_dict=chunk) # noqa: E203
+         with self.connection_config.get_search_client() as search_client:
+             for chunk in batch_generator(elements_dict, batch_size):
+                 self.write_dict(elements_dict=chunk, search_client=search_client) # noqa: E203


  azure_ai_search_destination_entry = DestinationRegistryEntry(
unstructured_ingest/v2/processes/connectors/chroma.py
@@ -1,5 +1,4 @@
  import json
- import uuid
  from dataclasses import dataclass, field
  from datetime import date, datetime
  from pathlib import Path
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
  )
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.v2.utils import get_enhanced_element_id

  from .utils import conform_string_to_dict

@@ -83,13 +83,12 @@ class ChromaUploadStager(UploadStager):
          return parser.parse(date_string)

      @staticmethod
-     def conform_dict(data: dict) -> dict:
+     def conform_dict(data: dict, file_data: FileData) -> dict:
          """
          Prepares dictionary in the format that Chroma requires
          """
-         element_id = data.get("element_id", str(uuid.uuid4()))
          return {
-             "id": element_id,
+             "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
              "embedding": data.pop("embeddings", None),
              "document": data.pop("text", None),
              "metadata": flatten_dict(data, separator="-", flatten_lists=True, remove_none=True),
@@ -105,7 +104,9 @@
      ) -> Path:
          with open(elements_filepath) as elements_file:
              elements_contents = json.load(elements_file)
-         conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
+         conformed_elements = [
+             self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+         ]
          output_path = Path(output_dir) / Path(f"{output_filename}.json")
          with open(output_path, "w") as output_file:
              json.dump(conformed_elements, output_file)
unstructured_ingest/v2/processes/connectors/confluence.py
@@ -30,16 +30,28 @@ CONNECTOR_TYPE = "confluence"


  class ConfluenceAccessConfig(AccessConfig):
-     api_token: str = Field(description="Confluence API token")
+     api_token: Optional[str] = Field(description="Confluence API token", default=None)
+     access_token: Optional[str] = Field(
+         description="Confluence Personal Access Token", default=None
+     )


  class ConfluenceConnectionConfig(ConnectionConfig):
      url: str = Field(description="URL of the Confluence instance")
-     user_email: str = Field(description="User email for authentication")
+     user_email: Optional[str] = Field(description="User email for authentication", default=None)
      access_config: Secret[ConfluenceAccessConfig] = Field(
          description="Access configuration for Confluence"
      )

+     def model_post_init(self, __context):
+         access_configs = self.access_config.get_secret_value()
+         basic_auth = self.user_email and access_configs.api_token
+         pat_auth = access_configs.access_token
+         if basic_auth and pat_auth:
+             raise ValueError("both forms of auth provided, only one allowed")
+         if not (basic_auth or pat_auth):
+             raise ValueError("neither forms of auth provided")
+
      @requires_dependencies(["atlassian"], extras="confluence")
      def get_client(self) -> "Confluence":
          from atlassian import Confluence
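
With the change above, a ConfluenceConnectionConfig must carry exactly one form of auth. A minimal sketch of the two accepted shapes, mirroring the new unit tests earlier in this diff (the URL, email, and token values are placeholders):

    from unstructured_ingest.v2.processes.connectors.confluence import (
        ConfluenceAccessConfig,
        ConfluenceConnectionConfig,
    )

    # Basic auth: user email plus API token.
    basic_auth_config = ConfluenceConnectionConfig(
        url="https://example.atlassian.net/wiki",
        user_email="user@example.com",
        access_config=ConfluenceAccessConfig(api_token="<api-token>"),
    )

    # Personal Access Token auth: access token only, no user email.
    pat_config = ConfluenceConnectionConfig(
        url="https://example.atlassian.net/wiki",
        access_config=ConfluenceAccessConfig(access_token="<personal-access-token>"),
    )

Supplying both tokens, or neither, raises a pydantic ValidationError, as exercised by test_connection_config_multiple_auth and test_connection_config_no_auth above.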
unstructured_ingest/v2/processes/connectors/duckdb/__init__.py
@@ -0,0 +1,15 @@
+ from __future__ import annotations
+
+ from unstructured_ingest.v2.processes.connector_registry import (
+     add_destination_entry,
+ )
+
+ from .duckdb import CONNECTOR_TYPE as DUCKDB_CONNECTOR_TYPE
+ from .duckdb import duckdb_destination_entry
+ from .motherduck import CONNECTOR_TYPE as MOTHERDUCK_CONNECTOR_TYPE
+ from .motherduck import motherduck_destination_entry
+
+ add_destination_entry(destination_type=DUCKDB_CONNECTOR_TYPE, entry=duckdb_destination_entry)
+ add_destination_entry(
+     destination_type=MOTHERDUCK_CONNECTOR_TYPE, entry=motherduck_destination_entry
+ )
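
The registration above exposes the new DuckDB and MotherDuck destinations through the connector registry. A minimal sketch of the direct stager-plus-uploader flow for the DuckDB destination, based on the integration test earlier in this diff (the file paths are placeholders, and the target database is assumed to already contain the expected elements table, e.g. one created from duckdb-schema.sql):

    from pathlib import Path

    from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
    from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
        CONNECTOR_TYPE,
        DuckDBConnectionConfig,
        DuckDBUploader,
        DuckDBUploaderConfig,
        DuckDBUploadStager,
    )

    elements_json = Path("partitioned/example.json")  # placeholder path to partitioned elements
    file_data = FileData(
        source_identifiers=SourceIdentifiers(
            fullpath=elements_json.name, filename=elements_json.name
        ),
        connector_type=CONNECTOR_TYPE,
        identifier="example-file-data",
    )

    # Stage the element JSON into the shape the uploader expects.
    staged_path = DuckDBUploadStager().run(
        elements_filepath=elements_json,
        file_data=file_data,
        output_dir="staged",
        output_filename="example",
    )

    # Write the staged records into a local DuckDB database file.
    uploader = DuckDBUploader(
        connection_config=DuckDBConnectionConfig(database="ingest.db"),
        upload_config=DuckDBUploaderConfig(),
    )
    uploader.run(path=staged_path, file_data=file_data)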