unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
|
@@ -29,14 +29,3 @@ async def test_chunker_api(chunker_file: Path, strategy: str):
|
|
|
29
29
|
chunker = Chunker(config=chunker_config)
|
|
30
30
|
results = await chunker.run_async(elements_filepath=chunker_file)
|
|
31
31
|
assert results
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
@pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
|
|
35
|
-
@pytest.mark.parametrize("strategy", ["basic", "by_title"])
|
|
36
|
-
def test_chunker_basic(chunker_file: Path, strategy: str):
|
|
37
|
-
chunker_config = ChunkerConfig(
|
|
38
|
-
chunking_strategy=strategy,
|
|
39
|
-
)
|
|
40
|
-
chunker = Chunker(config=chunker_config)
|
|
41
|
-
results = chunker.run(elements_filepath=chunker_file)
|
|
42
|
-
assert results
|
|
@@ -6,7 +6,7 @@ import pytest
|
|
|
6
6
|
|
|
7
7
|
from unstructured_ingest.v2.logger import logger
|
|
8
8
|
|
|
9
|
-
FILENAME = "DA-1p-with-duplicate-pages.pdf.json"
|
|
9
|
+
FILENAME = Path("DA-1p-with-duplicate-pages.pdf.json")
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
@pytest.fixture
|
|
@@ -19,6 +19,16 @@ def upload_file() -> Path:
|
|
|
19
19
|
return upload_file
|
|
20
20
|
|
|
21
21
|
|
|
22
|
+
@pytest.fixture
|
|
23
|
+
def upload_file_ndjson() -> Path:
|
|
24
|
+
int_test_dir = Path(__file__).parent
|
|
25
|
+
assets_dir = int_test_dir / "assets"
|
|
26
|
+
upload_file = assets_dir / FILENAME.with_suffix(".ndjson")
|
|
27
|
+
assert upload_file.exists()
|
|
28
|
+
assert upload_file.is_file()
|
|
29
|
+
return upload_file
|
|
30
|
+
|
|
31
|
+
|
|
22
32
|
@pytest.fixture
|
|
23
33
|
def temp_dir() -> Generator[Path, None, None]:
|
|
24
34
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
@@ -11,8 +11,8 @@ from databricks.sdk import WorkspaceClient
|
|
|
11
11
|
from databricks.sdk.errors.platform import NotFound
|
|
12
12
|
|
|
13
13
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
|
|
14
|
-
from test.integration.connectors.utils.validation import (
|
|
15
|
-
|
|
14
|
+
from test.integration.connectors.utils.validation.source import (
|
|
15
|
+
SourceValidationConfigs,
|
|
16
16
|
source_connector_validation,
|
|
17
17
|
)
|
|
18
18
|
from test.integration.utils import requires_env
|
|
@@ -82,7 +82,7 @@ async def test_volumes_native_source():
|
|
|
82
82
|
await source_connector_validation(
|
|
83
83
|
indexer=indexer,
|
|
84
84
|
downloader=downloader,
|
|
85
|
-
configs=
|
|
85
|
+
configs=SourceValidationConfigs(
|
|
86
86
|
test_id="databricks_volumes_native",
|
|
87
87
|
expected_num_files=1,
|
|
88
88
|
),
|
|
@@ -156,6 +156,7 @@ async def test_volumes_native_destination(upload_file: Path):
|
|
|
156
156
|
catalog=env_data.catalog,
|
|
157
157
|
),
|
|
158
158
|
)
|
|
159
|
+
uploader.precheck()
|
|
159
160
|
if uploader.is_async():
|
|
160
161
|
await uploader.run_async(path=upload_file, file_data=file_data)
|
|
161
162
|
else:
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
int_test_dir = Path(__file__).parent
|
|
6
|
+
assets_dir = int_test_dir / "assets"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@pytest.fixture
|
|
10
|
+
def duckdb_schema() -> Path:
|
|
11
|
+
schema_file = assets_dir / "duckdb-schema.sql"
|
|
12
|
+
assert schema_file.exists()
|
|
13
|
+
assert schema_file.is_file()
|
|
14
|
+
return schema_file
|
|
@@ -1,13 +1,15 @@
|
|
|
1
|
-
import
|
|
2
|
-
from contextlib import contextmanager
|
|
1
|
+
import json
|
|
3
2
|
from pathlib import Path
|
|
4
|
-
from typing import Generator
|
|
5
3
|
|
|
6
4
|
import duckdb
|
|
7
|
-
import pandas as pd
|
|
8
5
|
import pytest
|
|
6
|
+
from _pytest.fixtures import TopRequest
|
|
9
7
|
|
|
10
8
|
from test.integration.connectors.utils.constants import DESTINATION_TAG
|
|
9
|
+
from test.integration.connectors.utils.validation.destination import (
|
|
10
|
+
StagerValidationConfigs,
|
|
11
|
+
stager_validation,
|
|
12
|
+
)
|
|
11
13
|
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
12
14
|
from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
|
|
13
15
|
CONNECTOR_TYPE,
|
|
@@ -18,19 +20,15 @@ from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
|
|
|
18
20
|
)
|
|
19
21
|
|
|
20
22
|
|
|
21
|
-
@
|
|
22
|
-
def
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
query = f.read()
|
|
31
|
-
duckdb_connection.execute(query)
|
|
32
|
-
duckdb_connection.close()
|
|
33
|
-
yield db_path
|
|
23
|
+
@pytest.fixture
|
|
24
|
+
def provisioned_db_file(duckdb_schema: Path, temp_dir: Path) -> Path:
|
|
25
|
+
db_path = Path(temp_dir) / "temp_duck.db"
|
|
26
|
+
with duckdb.connect(database=db_path) as duckdb_connection:
|
|
27
|
+
with duckdb_schema.open("r") as f:
|
|
28
|
+
query = f.read()
|
|
29
|
+
duckdb_connection.execute(query)
|
|
30
|
+
duckdb_connection.close()
|
|
31
|
+
return db_path
|
|
34
32
|
|
|
35
33
|
|
|
36
34
|
def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
|
|
@@ -49,34 +47,43 @@ def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
|
|
|
49
47
|
|
|
50
48
|
|
|
51
49
|
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb")
|
|
52
|
-
def test_duckdb_destination(upload_file: Path):
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
50
|
+
def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_dir: Path):
|
|
51
|
+
file_data = FileData(
|
|
52
|
+
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
|
|
53
|
+
connector_type=CONNECTOR_TYPE,
|
|
54
|
+
identifier="mock-file-data",
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
stager = DuckDBUploadStager()
|
|
58
|
+
staged_path = stager.run(
|
|
59
|
+
elements_filepath=upload_file,
|
|
60
|
+
file_data=file_data,
|
|
61
|
+
output_dir=temp_dir,
|
|
62
|
+
output_filename=upload_file.name,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
connection_config = DuckDBConnectionConfig(database=str(provisioned_db_file))
|
|
66
|
+
upload_config = DuckDBUploaderConfig()
|
|
67
|
+
uploader = DuckDBUploader(connection_config=connection_config, upload_config=upload_config)
|
|
62
68
|
|
|
63
|
-
|
|
64
|
-
stager = DuckDBUploadStager()
|
|
65
|
-
stager_params = {
|
|
66
|
-
"elements_filepath": upload_file,
|
|
67
|
-
"file_data": file_data,
|
|
68
|
-
"output_dir": temp_dir,
|
|
69
|
-
"output_filename": "test_db",
|
|
70
|
-
}
|
|
71
|
-
staged_path = stager.run(**stager_params)
|
|
69
|
+
uploader.run(path=staged_path, file_data=file_data)
|
|
72
70
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
connection_config=connection_config, upload_config=upload_config
|
|
77
|
-
)
|
|
71
|
+
with staged_path.open() as f:
|
|
72
|
+
data = json.load(f)
|
|
73
|
+
validate_duckdb_destination(db_path=provisioned_db_file, expected_num_elements=len(data))
|
|
78
74
|
|
|
79
|
-
uploader.run(path=staged_path, file_data=file_data)
|
|
80
75
|
|
|
81
|
-
|
|
82
|
-
|
|
76
|
+
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
77
|
+
def test_duckdb_stager(
|
|
78
|
+
request: TopRequest,
|
|
79
|
+
upload_file_str: str,
|
|
80
|
+
tmp_path: Path,
|
|
81
|
+
):
|
|
82
|
+
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
83
|
+
stager = DuckDBUploadStager()
|
|
84
|
+
stager_validation(
|
|
85
|
+
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
86
|
+
input_file=upload_file,
|
|
87
|
+
stager=stager,
|
|
88
|
+
tmp_dir=tmp_path,
|
|
89
|
+
)
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
-
import tempfile
|
|
3
2
|
import uuid
|
|
4
|
-
from contextlib import contextmanager
|
|
5
3
|
from pathlib import Path
|
|
6
4
|
from typing import Generator
|
|
7
5
|
|
|
@@ -22,15 +20,19 @@ from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
|
|
|
22
20
|
)
|
|
23
21
|
|
|
24
22
|
|
|
25
|
-
@
|
|
26
|
-
def
|
|
23
|
+
@pytest.fixture
|
|
24
|
+
def md_token() -> str:
|
|
25
|
+
motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
|
|
26
|
+
assert motherduck_token
|
|
27
|
+
return motherduck_token
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@pytest.fixture
|
|
31
|
+
def provisioned_db(md_token: str, duckdb_schema: Path) -> Generator[str, None, None]:
|
|
27
32
|
database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
|
|
28
33
|
try:
|
|
29
|
-
db_init_path = Path(__file__).parent / "duckdb-schema.sql"
|
|
30
|
-
assert db_init_path.exists()
|
|
31
|
-
assert db_init_path.is_file()
|
|
32
34
|
with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
|
|
33
|
-
with
|
|
35
|
+
with duckdb_schema.open("r") as f:
|
|
34
36
|
query = f.read()
|
|
35
37
|
md_conn.execute(f"CREATE DATABASE {database_name}")
|
|
36
38
|
md_conn.execute(f"USE {database_name}")
|
|
@@ -59,48 +61,35 @@ def validate_motherduck_destination(database: str, expected_num_elements: int, m
|
|
|
59
61
|
conn.close()
|
|
60
62
|
|
|
61
63
|
|
|
62
|
-
|
|
63
|
-
motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
|
|
64
|
-
assert motherduck_token
|
|
65
|
-
return motherduck_token
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "motherduck")
|
|
64
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
69
65
|
@requires_env("MOTHERDUCK_TOKEN")
|
|
70
|
-
def test_motherduck_destination(
|
|
71
|
-
md_token
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
connector_type=CONNECTOR_TYPE,
|
|
79
|
-
identifier="mock-file-data",
|
|
80
|
-
)
|
|
66
|
+
def test_motherduck_destination(
|
|
67
|
+
md_token: str, upload_file: Path, provisioned_db: str, temp_dir: Path
|
|
68
|
+
):
|
|
69
|
+
file_data = FileData(
|
|
70
|
+
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
|
|
71
|
+
connector_type=CONNECTOR_TYPE,
|
|
72
|
+
identifier="mock-file-data",
|
|
73
|
+
)
|
|
81
74
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
}
|
|
90
|
-
staged_path = stager.run(**stager_params)
|
|
75
|
+
stager = MotherDuckUploadStager()
|
|
76
|
+
staged_path = stager.run(
|
|
77
|
+
elements_filepath=upload_file,
|
|
78
|
+
file_data=file_data,
|
|
79
|
+
output_dir=temp_dir,
|
|
80
|
+
output_filename=upload_file.name,
|
|
81
|
+
)
|
|
91
82
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
connection_config=connection_config, upload_config=upload_config
|
|
99
|
-
)
|
|
83
|
+
access_config = MotherDuckAccessConfig(md_token=md_token)
|
|
84
|
+
connection_config = MotherDuckConnectionConfig(
|
|
85
|
+
database=provisioned_db, access_config=access_config
|
|
86
|
+
)
|
|
87
|
+
upload_config = MotherDuckUploaderConfig()
|
|
88
|
+
uploader = MotherDuckUploader(connection_config=connection_config, upload_config=upload_config)
|
|
100
89
|
|
|
101
|
-
|
|
90
|
+
uploader.run(path=staged_path, file_data=file_data)
|
|
102
91
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
92
|
+
staged_df = pd.read_json(staged_path, orient="records", lines=True)
|
|
93
|
+
validate_motherduck_destination(
|
|
94
|
+
database=provisioned_db, expected_num_elements=len(staged_df), md_token=md_token
|
|
95
|
+
)
|
|
@@ -5,16 +5,20 @@ import time
|
|
|
5
5
|
from contextlib import contextmanager
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import Generator
|
|
8
|
-
|
|
8
|
+
from test.integration.connectors.utils.validation.destination import (
|
|
9
|
+
StagerValidationConfigs,
|
|
10
|
+
stager_validation,
|
|
11
|
+
)
|
|
9
12
|
import pandas as pd
|
|
10
13
|
import pytest
|
|
14
|
+
from _pytest.fixtures import TopRequest
|
|
11
15
|
from elasticsearch import Elasticsearch as ElasticsearchClient
|
|
12
16
|
from elasticsearch.helpers import bulk
|
|
13
17
|
|
|
14
18
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
|
|
15
19
|
from test.integration.connectors.utils.docker import HealthCheck, container_context
|
|
16
|
-
from test.integration.connectors.utils.validation import (
|
|
17
|
-
|
|
20
|
+
from test.integration.connectors.utils.validation.source import (
|
|
21
|
+
SourceValidationConfigs,
|
|
18
22
|
source_connector_validation,
|
|
19
23
|
)
|
|
20
24
|
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
@@ -194,7 +198,7 @@ async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.Data
|
|
|
194
198
|
await source_connector_validation(
|
|
195
199
|
indexer=indexer,
|
|
196
200
|
downloader=downloader,
|
|
197
|
-
configs=
|
|
201
|
+
configs=SourceValidationConfigs(
|
|
198
202
|
test_id=CONNECTOR_TYPE,
|
|
199
203
|
expected_num_files=expected_num_files,
|
|
200
204
|
expected_number_indexed_file_data=1,
|
|
@@ -306,3 +310,21 @@ def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str
|
|
|
306
310
|
)
|
|
307
311
|
with pytest.raises(DestinationConnectionError):
|
|
308
312
|
uploader.precheck()
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
316
|
+
def test_elasticsearch_stager(
|
|
317
|
+
request: TopRequest,
|
|
318
|
+
upload_file_str: str,
|
|
319
|
+
tmp_path: Path,
|
|
320
|
+
):
|
|
321
|
+
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
322
|
+
stager = ElasticsearchUploadStager(
|
|
323
|
+
upload_stager_config=ElasticsearchUploadStagerConfig(index_name="mock_index")
|
|
324
|
+
)
|
|
325
|
+
stager_validation(
|
|
326
|
+
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
327
|
+
input_file=upload_file,
|
|
328
|
+
stager=stager,
|
|
329
|
+
tmp_dir=tmp_path,
|
|
330
|
+
)
|
|
@@ -7,12 +7,17 @@ from typing import Generator
|
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
import pytest
|
|
10
|
+
from _pytest.fixtures import TopRequest
|
|
10
11
|
from opensearchpy import Document, Keyword, OpenSearch, Text
|
|
11
12
|
|
|
12
13
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
|
|
13
14
|
from test.integration.connectors.utils.docker import HealthCheck, container_context
|
|
14
|
-
from test.integration.connectors.utils.validation import (
|
|
15
|
-
|
|
15
|
+
from test.integration.connectors.utils.validation.destination import (
|
|
16
|
+
StagerValidationConfigs,
|
|
17
|
+
stager_validation,
|
|
18
|
+
)
|
|
19
|
+
from test.integration.connectors.utils.validation.source import (
|
|
20
|
+
SourceValidationConfigs,
|
|
16
21
|
source_connector_validation,
|
|
17
22
|
)
|
|
18
23
|
from unstructured_ingest.error import (
|
|
@@ -183,7 +188,7 @@ async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFra
|
|
|
183
188
|
await source_connector_validation(
|
|
184
189
|
indexer=indexer,
|
|
185
190
|
downloader=downloader,
|
|
186
|
-
configs=
|
|
191
|
+
configs=SourceValidationConfigs(
|
|
187
192
|
test_id=CONNECTOR_TYPE,
|
|
188
193
|
expected_num_files=expected_num_files,
|
|
189
194
|
expected_number_indexed_file_data=1,
|
|
@@ -300,3 +305,21 @@ def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
|
|
|
300
305
|
)
|
|
301
306
|
with pytest.raises(DestinationConnectionError):
|
|
302
307
|
uploader.precheck()
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
311
|
+
def test_opensearch_stager(
|
|
312
|
+
request: TopRequest,
|
|
313
|
+
upload_file_str: str,
|
|
314
|
+
tmp_path: Path,
|
|
315
|
+
):
|
|
316
|
+
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
317
|
+
stager = OpenSearchUploadStager(
|
|
318
|
+
upload_stager_config=OpenSearchUploadStagerConfig(index_name="mock_index")
|
|
319
|
+
)
|
|
320
|
+
stager_validation(
|
|
321
|
+
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
322
|
+
input_file=upload_file,
|
|
323
|
+
stager=stager,
|
|
324
|
+
tmp_dir=tmp_path,
|
|
325
|
+
)
|