unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
|
@@ -1,16 +1,19 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import os
|
|
2
|
-
import tempfile
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
|
-
import docker
|
|
6
|
-
import pandas as pd
|
|
7
5
|
import pytest
|
|
8
6
|
import snowflake.connector as sf
|
|
7
|
+
from _pytest.fixtures import TopRequest
|
|
9
8
|
|
|
10
9
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
|
|
11
10
|
from test.integration.connectors.utils.docker import container_context
|
|
12
|
-
from test.integration.connectors.utils.validation import (
|
|
13
|
-
|
|
11
|
+
from test.integration.connectors.utils.validation.destination import (
|
|
12
|
+
StagerValidationConfigs,
|
|
13
|
+
stager_validation,
|
|
14
|
+
)
|
|
15
|
+
from test.integration.connectors.utils.validation.source import (
|
|
16
|
+
SourceValidationConfigs,
|
|
14
17
|
source_connector_validation,
|
|
15
18
|
)
|
|
16
19
|
from test.integration.utils import requires_env
|
|
@@ -30,14 +33,15 @@ from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
|
|
|
30
33
|
SEED_DATA_ROWS = 20
|
|
31
34
|
|
|
32
35
|
|
|
33
|
-
def seed_data():
|
|
34
|
-
|
|
35
|
-
user
|
|
36
|
-
password
|
|
37
|
-
account
|
|
38
|
-
database
|
|
39
|
-
host
|
|
40
|
-
|
|
36
|
+
def seed_data() -> dict:
|
|
37
|
+
connect_params = {
|
|
38
|
+
"user": "test",
|
|
39
|
+
"password": "test",
|
|
40
|
+
"account": "test",
|
|
41
|
+
"database": "test",
|
|
42
|
+
"host": "snowflake.localhost.localstack.cloud",
|
|
43
|
+
}
|
|
44
|
+
conn = sf.connect(**connect_params)
|
|
41
45
|
|
|
42
46
|
file = Path(env_setup_path / "sql" / "snowflake" / "source" / "snowflake-schema.sql")
|
|
43
47
|
|
|
@@ -52,16 +56,31 @@ def seed_data():
|
|
|
52
56
|
|
|
53
57
|
cur.close()
|
|
54
58
|
conn.close()
|
|
59
|
+
return connect_params
|
|
55
60
|
|
|
56
61
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
62
|
+
@pytest.fixture
|
|
63
|
+
def source_database_setup() -> dict:
|
|
64
|
+
token = os.getenv("LOCALSTACK_AUTH_TOKEN")
|
|
65
|
+
with container_context(
|
|
66
|
+
image="localstack/snowflake",
|
|
67
|
+
environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
|
|
68
|
+
ports={4566: 4566, 443: 443},
|
|
69
|
+
healthcheck_retries=30,
|
|
70
|
+
):
|
|
71
|
+
connect_params = seed_data()
|
|
72
|
+
yield connect_params
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def init_db_destination() -> dict:
|
|
76
|
+
connect_params = {
|
|
77
|
+
"user": "test",
|
|
78
|
+
"password": "test",
|
|
79
|
+
"account": "test",
|
|
80
|
+
"database": "test",
|
|
81
|
+
"host": "snowflake.localhost.localstack.cloud",
|
|
82
|
+
}
|
|
83
|
+
conn = sf.connect(**connect_params)
|
|
65
84
|
|
|
66
85
|
file = Path(env_setup_path / "sql" / "snowflake" / "destination" / "snowflake-schema.sql")
|
|
67
86
|
|
|
@@ -73,52 +92,53 @@ def init_db_destination():
|
|
|
73
92
|
|
|
74
93
|
cur.close()
|
|
75
94
|
conn.close()
|
|
95
|
+
return connect_params
|
|
76
96
|
|
|
77
97
|
|
|
78
|
-
@pytest.
|
|
79
|
-
|
|
80
|
-
@requires_env("LOCALSTACK_AUTH_TOKEN")
|
|
81
|
-
async def test_snowflake_source():
|
|
82
|
-
docker_client = docker.from_env()
|
|
98
|
+
@pytest.fixture
|
|
99
|
+
def destination_database_setup() -> dict:
|
|
83
100
|
token = os.getenv("LOCALSTACK_AUTH_TOKEN")
|
|
84
101
|
with container_context(
|
|
85
|
-
docker_client=docker_client,
|
|
86
102
|
image="localstack/snowflake",
|
|
87
103
|
environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
|
|
88
104
|
ports={4566: 4566, 443: 443},
|
|
89
105
|
healthcheck_retries=30,
|
|
90
106
|
):
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
107
|
+
connect_params = init_db_destination()
|
|
108
|
+
yield connect_params
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@pytest.mark.asyncio
|
|
112
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
|
|
113
|
+
@requires_env("LOCALSTACK_AUTH_TOKEN")
|
|
114
|
+
async def test_snowflake_source(temp_dir: Path, source_database_setup: dict):
|
|
115
|
+
connection_config = SnowflakeConnectionConfig(
|
|
116
|
+
access_config=SnowflakeAccessConfig(password="test"),
|
|
117
|
+
account="test",
|
|
118
|
+
user="test",
|
|
119
|
+
database="test",
|
|
120
|
+
host="snowflake.localhost.localstack.cloud",
|
|
121
|
+
)
|
|
122
|
+
indexer = SnowflakeIndexer(
|
|
123
|
+
connection_config=connection_config,
|
|
124
|
+
index_config=SnowflakeIndexerConfig(table_name="cars", id_column="CAR_ID", batch_size=5),
|
|
125
|
+
)
|
|
126
|
+
downloader = SnowflakeDownloader(
|
|
127
|
+
connection_config=connection_config,
|
|
128
|
+
download_config=SnowflakeDownloaderConfig(
|
|
129
|
+
fields=["CAR_ID", "BRAND"], download_dir=temp_dir
|
|
130
|
+
),
|
|
131
|
+
)
|
|
132
|
+
await source_connector_validation(
|
|
133
|
+
indexer=indexer,
|
|
134
|
+
downloader=downloader,
|
|
135
|
+
configs=SourceValidationConfigs(
|
|
136
|
+
test_id="snowflake",
|
|
137
|
+
expected_num_files=SEED_DATA_ROWS,
|
|
138
|
+
expected_number_indexed_file_data=4,
|
|
139
|
+
validate_downloaded_files=True,
|
|
140
|
+
),
|
|
141
|
+
)
|
|
122
142
|
|
|
123
143
|
|
|
124
144
|
def validate_destination(
|
|
@@ -145,65 +165,70 @@ def validate_destination(
|
|
|
145
165
|
@pytest.mark.asyncio
|
|
146
166
|
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
|
|
147
167
|
@requires_env("LOCALSTACK_AUTH_TOKEN")
|
|
148
|
-
async def test_snowflake_destination(
|
|
168
|
+
async def test_snowflake_destination(
|
|
169
|
+
upload_file: Path, temp_dir: Path, destination_database_setup: dict
|
|
170
|
+
):
|
|
149
171
|
# the postgres destination connector doesn't leverage the file data but is required as an input,
|
|
150
172
|
# mocking it with arbitrary values to meet the base requirements:
|
|
151
173
|
mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
connect_params
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
174
|
+
init_db_destination()
|
|
175
|
+
stager = SnowflakeUploadStager()
|
|
176
|
+
staged_path = stager.run(
|
|
177
|
+
elements_filepath=upload_file,
|
|
178
|
+
file_data=mock_file_data,
|
|
179
|
+
output_dir=temp_dir,
|
|
180
|
+
output_filename=upload_file.name,
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# The stager should append the `.json` suffix to the output filename passed in.
|
|
184
|
+
assert staged_path.suffix == upload_file.suffix
|
|
185
|
+
|
|
186
|
+
connect_params = {
|
|
187
|
+
"user": "test",
|
|
188
|
+
"password": "test",
|
|
189
|
+
"account": "test",
|
|
190
|
+
"database": "test",
|
|
191
|
+
"host": "snowflake.localhost.localstack.cloud",
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
uploader = SnowflakeUploader(
|
|
195
|
+
connection_config=SnowflakeConnectionConfig(
|
|
196
|
+
access_config=SnowflakeAccessConfig(password=connect_params["password"]),
|
|
197
|
+
account=connect_params["account"],
|
|
198
|
+
user=connect_params["user"],
|
|
199
|
+
database=connect_params["database"],
|
|
200
|
+
host=connect_params["host"],
|
|
201
|
+
)
|
|
202
|
+
)
|
|
203
|
+
uploader.precheck()
|
|
204
|
+
uploader.run(path=staged_path, file_data=mock_file_data)
|
|
205
|
+
|
|
206
|
+
with staged_path.open("r") as f:
|
|
207
|
+
staged_data = json.load(f)
|
|
208
|
+
expected_num_elements = len(staged_data)
|
|
209
|
+
validate_destination(
|
|
210
|
+
connect_params=connect_params,
|
|
211
|
+
expected_num_elements=expected_num_elements,
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
uploader.run(path=staged_path, file_data=mock_file_data)
|
|
215
|
+
validate_destination(
|
|
216
|
+
connect_params=connect_params,
|
|
217
|
+
expected_num_elements=expected_num_elements,
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
222
|
+
def test_snowflake_stager(
|
|
223
|
+
request: TopRequest,
|
|
224
|
+
upload_file_str: str,
|
|
225
|
+
tmp_path: Path,
|
|
226
|
+
):
|
|
227
|
+
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
228
|
+
stager = SnowflakeUploadStager()
|
|
229
|
+
stager_validation(
|
|
230
|
+
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
231
|
+
input_file=upload_file,
|
|
232
|
+
stager=stager,
|
|
233
|
+
tmp_dir=tmp_path,
|
|
234
|
+
)
|
|
@@ -1,14 +1,18 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import sqlite3
|
|
2
3
|
import tempfile
|
|
3
|
-
from contextlib import contextmanager
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
|
-
import pandas as pd
|
|
7
6
|
import pytest
|
|
7
|
+
from _pytest.fixtures import TopRequest
|
|
8
8
|
|
|
9
9
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
|
|
10
|
-
from test.integration.connectors.utils.validation import (
|
|
11
|
-
|
|
10
|
+
from test.integration.connectors.utils.validation.destination import (
|
|
11
|
+
StagerValidationConfigs,
|
|
12
|
+
stager_validation,
|
|
13
|
+
)
|
|
14
|
+
from test.integration.connectors.utils.validation.source import (
|
|
15
|
+
SourceValidationConfigs,
|
|
12
16
|
source_connector_validation,
|
|
13
17
|
)
|
|
14
18
|
from unstructured_ingest.v2.interfaces import FileData
|
|
@@ -23,11 +27,11 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
|
|
|
23
27
|
SQLiteUploadStager,
|
|
24
28
|
)
|
|
25
29
|
|
|
26
|
-
SEED_DATA_ROWS =
|
|
30
|
+
SEED_DATA_ROWS = 10
|
|
27
31
|
|
|
28
32
|
|
|
29
|
-
@
|
|
30
|
-
def
|
|
33
|
+
@pytest.fixture
|
|
34
|
+
def source_database_setup() -> Path:
|
|
31
35
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
32
36
|
db_path = Path(tmpdir) / "mock_database.db"
|
|
33
37
|
db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
|
|
@@ -49,49 +53,42 @@ def sqlite_download_setup() -> Path:
|
|
|
49
53
|
|
|
50
54
|
@pytest.mark.asyncio
|
|
51
55
|
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
|
|
52
|
-
async def test_sqlite_source():
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
),
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
@contextmanager
|
|
81
|
-
def sqlite_upload_setup() -> Path:
|
|
56
|
+
async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
|
|
57
|
+
connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
|
|
58
|
+
indexer = SQLiteIndexer(
|
|
59
|
+
connection_config=connection_config,
|
|
60
|
+
index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
|
|
61
|
+
)
|
|
62
|
+
downloader = SQLiteDownloader(
|
|
63
|
+
connection_config=connection_config,
|
|
64
|
+
download_config=SQLiteDownloaderConfig(fields=["car_id", "brand"], download_dir=temp_dir),
|
|
65
|
+
)
|
|
66
|
+
await source_connector_validation(
|
|
67
|
+
indexer=indexer,
|
|
68
|
+
downloader=downloader,
|
|
69
|
+
configs=SourceValidationConfigs(
|
|
70
|
+
test_id="sqlite",
|
|
71
|
+
expected_num_files=SEED_DATA_ROWS,
|
|
72
|
+
expected_number_indexed_file_data=2,
|
|
73
|
+
validate_downloaded_files=True,
|
|
74
|
+
),
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@pytest.fixture
|
|
79
|
+
def destination_database_setup(temp_dir: Path) -> Path:
|
|
82
80
|
# Provision the local file that sqlite points to to have the desired schema for the integration
|
|
83
81
|
# tests and make sure the file and connection get cleaned up by using a context manager.
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
with
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
yield db_path
|
|
82
|
+
db_path = temp_dir / "elements.db"
|
|
83
|
+
db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
|
|
84
|
+
assert db_init_path.exists()
|
|
85
|
+
assert db_init_path.is_file()
|
|
86
|
+
with sqlite3.connect(database=db_path) as sqlite_connection:
|
|
87
|
+
with db_init_path.open("r") as f:
|
|
88
|
+
query = f.read()
|
|
89
|
+
cursor = sqlite_connection.cursor()
|
|
90
|
+
cursor.executescript(query)
|
|
91
|
+
return db_path
|
|
95
92
|
|
|
96
93
|
|
|
97
94
|
def validate_destination(db_path: Path, expected_num_elements: int):
|
|
@@ -114,34 +111,48 @@ def validate_destination(db_path: Path, expected_num_elements: int):
|
|
|
114
111
|
|
|
115
112
|
@pytest.mark.asyncio
|
|
116
113
|
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
|
|
117
|
-
async def test_sqlite_destination(
|
|
114
|
+
async def test_sqlite_destination(
|
|
115
|
+
upload_file: Path, temp_dir: Path, destination_database_setup: Path
|
|
116
|
+
):
|
|
118
117
|
# the sqlite destination connector doesn't leverage the file data but is required as an input,
|
|
119
118
|
# mocking it with arbitrary values to meet the base requirements:
|
|
120
119
|
mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
120
|
+
stager = SQLiteUploadStager()
|
|
121
|
+
staged_path = stager.run(
|
|
122
|
+
elements_filepath=upload_file,
|
|
123
|
+
file_data=mock_file_data,
|
|
124
|
+
output_dir=temp_dir,
|
|
125
|
+
output_filename=upload_file.name,
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# The stager should append the `.json` suffix to the output filename passed in.
|
|
129
|
+
assert staged_path.suffix == upload_file.suffix
|
|
130
|
+
|
|
131
|
+
uploader = SQLiteUploader(
|
|
132
|
+
connection_config=SQLiteConnectionConfig(database_path=destination_database_setup)
|
|
133
|
+
)
|
|
134
|
+
uploader.precheck()
|
|
135
|
+
uploader.run(path=staged_path, file_data=mock_file_data)
|
|
136
|
+
|
|
137
|
+
with staged_path.open("r") as f:
|
|
138
|
+
staged_data = json.load(f)
|
|
139
|
+
validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
|
|
140
|
+
|
|
141
|
+
uploader.run(path=staged_path, file_data=mock_file_data)
|
|
142
|
+
validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
146
|
+
def test_sqlite_stager(
|
|
147
|
+
request: TopRequest,
|
|
148
|
+
upload_file_str: str,
|
|
149
|
+
tmp_path: Path,
|
|
150
|
+
):
|
|
151
|
+
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
152
|
+
stager = SQLiteUploadStager()
|
|
153
|
+
stager_validation(
|
|
154
|
+
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
155
|
+
input_file=upload_file,
|
|
156
|
+
stager=stager,
|
|
157
|
+
tmp_dir=tmp_path,
|
|
158
|
+
)
|
|
@@ -5,16 +5,27 @@ from pathlib import Path
|
|
|
5
5
|
from uuid import uuid4
|
|
6
6
|
|
|
7
7
|
import pytest
|
|
8
|
+
from _pytest.fixtures import TopRequest
|
|
8
9
|
from astrapy import Collection
|
|
9
10
|
from astrapy import DataAPIClient as AstraDBClient
|
|
10
11
|
|
|
11
12
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
|
|
13
|
+
from test.integration.connectors.utils.validation.destination import (
|
|
14
|
+
StagerValidationConfigs,
|
|
15
|
+
stager_validation,
|
|
16
|
+
)
|
|
17
|
+
from test.integration.connectors.utils.validation.source import (
|
|
18
|
+
SourceValidationConfigs,
|
|
19
|
+
source_connector_validation,
|
|
20
|
+
)
|
|
12
21
|
from test.integration.utils import requires_env
|
|
13
22
|
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
14
23
|
from unstructured_ingest.v2.processes.connectors.astradb import (
|
|
15
24
|
CONNECTOR_TYPE,
|
|
16
25
|
AstraDBAccessConfig,
|
|
17
26
|
AstraDBConnectionConfig,
|
|
27
|
+
AstraDBDownloader,
|
|
28
|
+
AstraDBDownloaderConfig,
|
|
18
29
|
AstraDBIndexer,
|
|
19
30
|
AstraDBIndexerConfig,
|
|
20
31
|
AstraDBUploader,
|
|
@@ -105,10 +116,44 @@ def collection(upload_file: Path) -> Collection:
|
|
|
105
116
|
astra_db.drop_collection(collection)
|
|
106
117
|
|
|
107
118
|
|
|
119
|
+
@pytest.mark.asyncio
|
|
120
|
+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
|
|
121
|
+
@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
|
|
122
|
+
async def test_astra_search_source(
|
|
123
|
+
tmp_path: Path,
|
|
124
|
+
):
|
|
125
|
+
env_data = get_env_data()
|
|
126
|
+
collection_name = "ingest_test_src"
|
|
127
|
+
connection_config = AstraDBConnectionConfig(
|
|
128
|
+
access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
|
|
129
|
+
)
|
|
130
|
+
indexer = AstraDBIndexer(
|
|
131
|
+
index_config=AstraDBIndexerConfig(
|
|
132
|
+
collection_name=collection_name,
|
|
133
|
+
),
|
|
134
|
+
connection_config=connection_config,
|
|
135
|
+
)
|
|
136
|
+
downloader = AstraDBDownloader(
|
|
137
|
+
connection_config=connection_config,
|
|
138
|
+
download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
await source_connector_validation(
|
|
142
|
+
indexer=indexer,
|
|
143
|
+
downloader=downloader,
|
|
144
|
+
configs=SourceValidationConfigs(
|
|
145
|
+
test_id=CONNECTOR_TYPE,
|
|
146
|
+
expected_num_files=5,
|
|
147
|
+
expected_number_indexed_file_data=1,
|
|
148
|
+
validate_downloaded_files=True,
|
|
149
|
+
),
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
|
|
108
153
|
@pytest.mark.asyncio
|
|
109
154
|
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
110
155
|
@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
|
|
111
|
-
async def
|
|
156
|
+
async def test_astra_search_destination(
|
|
112
157
|
upload_file: Path,
|
|
113
158
|
collection: Collection,
|
|
114
159
|
tmp_path: Path,
|
|
@@ -154,3 +199,19 @@ async def test_azure_ai_search_destination(
|
|
|
154
199
|
f"Expected count ({expected_count}) doesn't match how "
|
|
155
200
|
f"much came back from collection: {current_count}"
|
|
156
201
|
)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
205
|
+
def test_astra_stager(
|
|
206
|
+
request: TopRequest,
|
|
207
|
+
upload_file_str: str,
|
|
208
|
+
tmp_path: Path,
|
|
209
|
+
):
|
|
210
|
+
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
211
|
+
stager = AstraDBUploadStager()
|
|
212
|
+
stager_validation(
|
|
213
|
+
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
214
|
+
input_file=upload_file,
|
|
215
|
+
stager=stager,
|
|
216
|
+
tmp_dir=tmp_path,
|
|
217
|
+
)
|
|
@@ -5,6 +5,7 @@ from pathlib import Path
|
|
|
5
5
|
from uuid import uuid4
|
|
6
6
|
|
|
7
7
|
import pytest
|
|
8
|
+
from _pytest.fixtures import TopRequest
|
|
8
9
|
from azure.core.credentials import AzureKeyCredential
|
|
9
10
|
from azure.search.documents import SearchClient
|
|
10
11
|
from azure.search.documents.indexes import SearchIndexClient
|
|
@@ -25,6 +26,10 @@ from azure.search.documents.indexes.models import (
|
|
|
25
26
|
from test.integration.connectors.utils.constants import (
|
|
26
27
|
DESTINATION_TAG,
|
|
27
28
|
)
|
|
29
|
+
from test.integration.connectors.utils.validation.destination import (
|
|
30
|
+
StagerValidationConfigs,
|
|
31
|
+
stager_validation,
|
|
32
|
+
)
|
|
28
33
|
from test.integration.utils import requires_env
|
|
29
34
|
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
30
35
|
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
|
|
@@ -225,9 +230,26 @@ async def test_azure_ai_search_destination(
|
|
|
225
230
|
with staged_filepath.open() as f:
|
|
226
231
|
staged_elements = json.load(f)
|
|
227
232
|
expected_count = len(staged_elements)
|
|
228
|
-
|
|
229
|
-
|
|
233
|
+
with uploader.connection_config.get_search_client() as search_client:
|
|
234
|
+
validate_count(search_client=search_client, expected_count=expected_count)
|
|
230
235
|
|
|
231
236
|
# Rerun and make sure the same documents get updated
|
|
232
237
|
uploader.run(path=staged_filepath, file_data=file_data)
|
|
233
|
-
|
|
238
|
+
with uploader.connection_config.get_search_client() as search_client:
|
|
239
|
+
validate_count(search_client=search_client, expected_count=expected_count)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
243
|
+
def test_azure_ai_search_stager(
|
|
244
|
+
request: TopRequest,
|
|
245
|
+
upload_file_str: str,
|
|
246
|
+
tmp_path: Path,
|
|
247
|
+
):
|
|
248
|
+
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
249
|
+
stager = AzureAISearchUploadStager()
|
|
250
|
+
stager_validation(
|
|
251
|
+
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
252
|
+
input_file=upload_file,
|
|
253
|
+
stager=stager,
|
|
254
|
+
tmp_dir=tmp_path,
|
|
255
|
+
)
|