unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +102 -91
- test/integration/connectors/sql/test_singlestore.py +111 -99
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +86 -75
- test/integration/connectors/test_astradb.py +22 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +4 -4
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +3 -3
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
- unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +18 -14
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
test/integration/chunkers/test_chunkers.py

@@ -29,14 +29,3 @@ async def test_chunker_api(chunker_file: Path, strategy: str):
     chunker = Chunker(config=chunker_config)
     results = await chunker.run_async(elements_filepath=chunker_file)
     assert results
-
-
-@pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
-@pytest.mark.parametrize("strategy", ["basic", "by_title"])
-def test_chunker_basic(chunker_file: Path, strategy: str):
-    chunker_config = ChunkerConfig(
-        chunking_strategy=strategy,
-    )
-    chunker = Chunker(config=chunker_config)
-    results = chunker.run(elements_filepath=chunker_file)
-    assert results
test/integration/connectors/conftest.py

@@ -6,7 +6,7 @@ import pytest

 from unstructured_ingest.v2.logger import logger

-FILENAME = "DA-1p-with-duplicate-pages.pdf.json"
+FILENAME = Path("DA-1p-with-duplicate-pages.pdf.json")


 @pytest.fixture
@@ -19,6 +19,16 @@ def upload_file() -> Path:
     return upload_file


+@pytest.fixture
+def upload_file_ndjson() -> Path:
+    int_test_dir = Path(__file__).parent
+    assets_dir = int_test_dir / "assets"
+    upload_file = assets_dir / FILENAME.with_suffix(".ndjson")
+    assert upload_file.exists()
+    assert upload_file.is_file()
+    return upload_file
+
+
 @pytest.fixture
 def temp_dir() -> Generator[Path, None, None]:
     with tempfile.TemporaryDirectory() as temp_dir:
test/integration/connectors/databricks_tests/test_volumes_native.py

@@ -11,8 +11,8 @@ from databricks.sdk import WorkspaceClient
 from databricks.sdk.errors.platform import NotFound

 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -82,7 +82,7 @@ async def test_volumes_native_source():
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id="databricks_volumes_native",
             expected_num_files=1,
         ),
@@ -156,6 +156,7 @@ async def test_volumes_native_destination(upload_file: Path):
             catalog=env_data.catalog,
         ),
     )
+    uploader.precheck()
    if uploader.is_async():
        await uploader.run_async(path=upload_file, file_data=file_data)
    else:
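The destination test now calls uploader.precheck() before running the upload. A minimal sketch of that fail-fast flow, with the uploader left abstract (the helper function below is hypothetical, not part of the package):

from pathlib import Path

from unstructured_ingest.v2.interfaces import FileData


def upload_with_precheck(uploader, path: Path, file_data: FileData) -> None:
    # Surface connectivity/auth problems before any data is moved.
    uploader.precheck()
    uploader.run(path=path, file_data=file_data)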
test/integration/connectors/duckdb/conftest.py

@@ -0,0 +1,14 @@
+from pathlib import Path
+
+import pytest
+
+int_test_dir = Path(__file__).parent
+assets_dir = int_test_dir / "assets"
+
+
+@pytest.fixture
+def duckdb_schema() -> Path:
+    schema_file = assets_dir / "duckdb-schema.sql"
+    assert schema_file.exists()
+    assert schema_file.is_file()
+    return schema_file
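Both the DuckDB and MotherDuck tests now provision their databases from this shared schema file. The provisioning step reduces to reading the SQL and executing it against a fresh connection; a minimal sketch of that pattern (the helper name is illustrative):

from pathlib import Path

import duckdb


def apply_schema(db_path: Path, schema_file: Path) -> None:
    # Create/open the database file and run the schema script against it.
    with duckdb.connect(database=str(db_path)) as conn:
        conn.execute(schema_file.read_text())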
test/integration/connectors/duckdb/test_duckdb.py

@@ -1,13 +1,15 @@
-import tempfile
-from contextlib import contextmanager
+import json
 from pathlib import Path
-from typing import Generator

 import duckdb
-import pandas as pd
 import pytest
+from _pytest.fixtures import TopRequest

 from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
     CONNECTOR_TYPE,
@@ -18,19 +20,15 @@ from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
 )


-@contextmanager
-def
-
-
-
-
-
-
-
-            query = f.read()
-        duckdb_connection.execute(query)
-        duckdb_connection.close()
-        yield db_path
+@pytest.fixture
+def provisioned_db_file(duckdb_schema: Path, temp_dir: Path) -> Path:
+    db_path = Path(temp_dir) / "temp_duck.db"
+    with duckdb.connect(database=db_path) as duckdb_connection:
+        with duckdb_schema.open("r") as f:
+            query = f.read()
+        duckdb_connection.execute(query)
+        duckdb_connection.close()
+    return db_path
@@ -49,34 +47,43 @@ def validate_duckdb_destination(db_path: Path, expected_num_elements: int):


 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb")
-def test_duckdb_destination(upload_file: Path):
-
-
-
-
-
-
-
-
-
+def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_dir: Path):
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )
+
+    stager = DuckDBUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+
+    connection_config = DuckDBConnectionConfig(database=str(provisioned_db_file))
+    upload_config = DuckDBUploaderConfig()
+    uploader = DuckDBUploader(connection_config=connection_config, upload_config=upload_config)

-
-    stager = DuckDBUploadStager()
-    stager_params = {
-        "elements_filepath": upload_file,
-        "file_data": file_data,
-        "output_dir": temp_dir,
-        "output_filename": "test_db",
-    }
-    staged_path = stager.run(**stager_params)
+    uploader.run(path=staged_path, file_data=file_data)

-
-
-
-        connection_config=connection_config, upload_config=upload_config
-    )
+    with staged_path.open() as f:
+        data = json.load(f)
+    validate_duckdb_destination(db_path=provisioned_db_file, expected_num_elements=len(data))

-    uploader.run(path=staged_path, file_data=file_data)

-
-
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_duckdb_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = DuckDBUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
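stager_validation and StagerValidationConfigs come from the new test/integration/connectors/utils/validation/destination.py, which this release adds (+88) but which is not displayed here. A plausible sketch of what such a helper checks, assuming it stages the input with a mock FileData and compares the staged record count; everything beyond the names used in the tests above is an assumption:

import json
from dataclasses import dataclass
from pathlib import Path

from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers


@dataclass
class StagerValidationConfigs:
    test_id: str
    expected_count: int


def stager_validation(configs: StagerValidationConfigs, input_file: Path, stager, tmp_dir: Path) -> None:
    # Stage the input file the same way the connector tests do.
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=input_file.name, filename=input_file.name),
        connector_type=configs.test_id,
        identifier="mock-file-data",
    )
    staged_path = stager.run(
        elements_filepath=input_file,
        file_data=file_data,
        output_dir=tmp_dir,
        output_filename=input_file.name,
    )
    # Count staged records: one JSON object per line for ndjson, a list otherwise.
    if staged_path.suffix == ".ndjson":
        with staged_path.open() as f:
            count = sum(1 for _ in f)
    else:
        with staged_path.open() as f:
            count = len(json.load(f))
    assert count == configs.expected_count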
test/integration/connectors/duckdb/test_motherduck.py

@@ -1,7 +1,5 @@
 import os
-import tempfile
 import uuid
-from contextlib import contextmanager
 from pathlib import Path
 from typing import Generator

@@ -22,15 +20,19 @@ from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
 )


-@contextmanager
-def
+@pytest.fixture
+def md_token() -> str:
+    motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
+    assert motherduck_token
+    return motherduck_token
+
+
+@pytest.fixture
+def provisioned_db(md_token: str, duckdb_schema: Path) -> Generator[str, None, None]:
     database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
     try:
-        db_init_path = Path(__file__).parent / "duckdb-schema.sql"
-        assert db_init_path.exists()
-        assert db_init_path.is_file()
         with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
-            with db_init_path.open("r") as f:
+            with duckdb_schema.open("r") as f:
                 query = f.read()
             md_conn.execute(f"CREATE DATABASE {database_name}")
             md_conn.execute(f"USE {database_name}")
@@ -59,48 +61,35 @@ def validate_motherduck_destination(database: str, expected_num_elements: int, m
     conn.close()


-
-    motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
-    assert motherduck_token
-    return motherduck_token
-
-
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "motherduck")
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("MOTHERDUCK_TOKEN")
-def test_motherduck_destination(
-    md_token
-
-
-
-
-
-
-        connector_type=CONNECTOR_TYPE,
-        identifier="mock-file-data",
-    )
+def test_motherduck_destination(
+    md_token: str, upload_file: Path, provisioned_db: str, temp_dir: Path
+):
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock-file-data",
+    )

-
-
-
-
-
-
-
-    }
-    staged_path = stager.run(**stager_params)
+    stager = MotherDuckUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )

-
-
-
-
-
-        connection_config=connection_config, upload_config=upload_config
-    )
+    access_config = MotherDuckAccessConfig(md_token=md_token)
+    connection_config = MotherDuckConnectionConfig(
+        database=provisioned_db, access_config=access_config
+    )
+    upload_config = MotherDuckUploaderConfig()
+    uploader = MotherDuckUploader(connection_config=connection_config, upload_config=upload_config)

-
+    uploader.run(path=staged_path, file_data=file_data)

-
-
-
-
+    staged_df = pd.read_json(staged_path, orient="records", lines=True)
+    validate_motherduck_destination(
+        database=provisioned_db, expected_num_elements=len(staged_df), md_token=md_token
+    )
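The provisioned_db fixture opens a try: around provisioning, but the cleanup half falls outside the hunks shown here. A plausible sketch of the full fixture shape, assuming it drops the per-test database afterwards (the finally: block and the DROP statement are assumptions, not shown in this diff):

import uuid
from pathlib import Path
from typing import Generator

import duckdb
import pytest


@pytest.fixture
def provisioned_db(md_token: str, duckdb_schema: Path) -> Generator[str, None, None]:
    database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
    try:
        with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
            md_conn.execute(f"CREATE DATABASE {database_name}")
            md_conn.execute(f"USE {database_name}")
            md_conn.execute(duckdb_schema.read_text())
        yield database_name
    finally:
        # Assumed cleanup: drop the throwaway database once the test finishes.
        with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
            md_conn.execute(f"DROP DATABASE IF EXISTS {database_name}")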
test/integration/connectors/elasticsearch/test_elasticsearch.py

@@ -5,16 +5,20 @@ import time
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Generator
-
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 import pandas as pd
 import pytest
+from _pytest.fixtures import TopRequest
 from elasticsearch import Elasticsearch as ElasticsearchClient
 from elasticsearch.helpers import bulk

 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
 from test.integration.connectors.utils.docker import HealthCheck, container_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
@@ -194,7 +198,7 @@ async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.Data
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id=CONNECTOR_TYPE,
             expected_num_files=expected_num_files,
             expected_number_indexed_file_data=1,
@@ -306,3 +310,21 @@ def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str
     )
     with pytest.raises(DestinationConnectionError):
         uploader.precheck()
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_elasticsearch_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = ElasticsearchUploadStager(
+        upload_stager_config=ElasticsearchUploadStagerConfig(index_name="mock_index")
+    )
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
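ElasticsearchUploadStagerConfig carries the target index name, which suggests the stager shapes each element into a bulk-indexable action during staging. A hedged sketch of that shaping; the wrapper function and the choice of element_id as the document id are assumptions, only the _index/_id/_source keys follow the Elasticsearch bulk-helper convention:

def to_bulk_action(element: dict, index_name: str) -> dict:
    # Wrap one staged element as an index action targeting the configured index.
    return {
        "_index": index_name,
        "_id": element.get("element_id"),
        "_source": element,
    }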
test/integration/connectors/elasticsearch/test_opensearch.py

@@ -7,12 +7,17 @@ from typing import Generator

 import pandas as pd
 import pytest
+from _pytest.fixtures import TopRequest
 from opensearchpy import Document, Keyword, OpenSearch, Text

 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
 from test.integration.connectors.utils.docker import HealthCheck, container_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from unstructured_ingest.error import (
@@ -183,7 +188,7 @@ async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFra
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
-        configs=ValidationConfigs(
+        configs=SourceValidationConfigs(
             test_id=CONNECTOR_TYPE,
             expected_num_files=expected_num_files,
             expected_number_indexed_file_data=1,
@@ -300,3 +305,21 @@ def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
     )
     with pytest.raises(DestinationConnectionError):
         uploader.precheck()
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_opensearch_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = OpenSearchUploadStager(
+        upload_stager_config=OpenSearchUploadStagerConfig(index_name="mock_index")
+    )
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
test/integration/connectors/sql/test_postgres.py

@@ -1,15 +1,18 @@
-import tempfile
-from contextlib import contextmanager
+import json
 from pathlib import Path

-import pandas as pd
 import pytest
+from _pytest.fixtures import TopRequest
 from psycopg2 import connect

 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
 from test.integration.connectors.utils.docker_compose import docker_compose_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from unstructured_ingest.v2.interfaces import FileData
@@ -28,13 +31,14 @@ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
 SEED_DATA_ROWS = 20


-@contextmanager
-def postgres_download_setup() -> None:
+@pytest.fixture
+def source_database_setup() -> str:
+    db_name = "test_db"
     with docker_compose_context(docker_compose_path=env_setup_path / "sql" / "postgres" / "source"):
         connection = connect(
             user="unstructured",
             password="test",
-            dbname="test_db",
+            dbname=db_name,
             host="localhost",
             port=5433,
         )
@@ -43,12 +47,12 @@ def postgres_download_setup() -> None:
             sql_statment = f"INSERT INTO cars (brand, price) VALUES " f"('brand_{i}', {i})"
             cursor.execute(sql_statment)
         connection.commit()
-        yield
+        yield db_name


 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
-async def test_postgres_source():
+async def test_postgres_source(temp_dir: Path, source_database_setup: str):
     connect_params = {
         "host": "localhost",
         "port": 5433,
@@ -56,37 +60,31 @@ async def test_postgres_source():
         "user": "unstructured",
         "password": "test",
     }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            test_id="postgres",
-            expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=4,
-            validate_downloaded_files=True,
-        ),
-    )
+    connection_config = PostgresConnectionConfig(
+        host=connect_params["host"],
+        port=connect_params["port"],
+        database=connect_params["database"],
+        username=connect_params["user"],
+        access_config=PostgresAccessConfig(password=connect_params["password"]),
+    )
+    indexer = PostgresIndexer(
+        connection_config=connection_config,
+        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+    )
+    downloader = PostgresDownloader(
+        connection_config=connection_config,
+        download_config=PostgresDownloaderConfig(fields=["car_id", "brand"], download_dir=temp_dir),
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="postgres",
+            expected_num_files=SEED_DATA_ROWS,
+            expected_number_indexed_file_data=4,
+            validate_downloaded_files=True,
+        ),
+    )


 def validate_destination(
@@ -118,63 +116,76 @@ def validate_destination(

 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
-async def test_postgres_destination(upload_file: Path):
+async def test_postgres_destination(upload_file: Path, temp_dir: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
     ):
-
-
-
-
-
-
-
-        }
-        if stager.is_async():
-            staged_path = await stager.run_async(**stager_params)
-        else:
-            staged_path = stager.run(**stager_params)
-
-        # The stager should append the `.json` suffix to the output filename passed in.
-        assert staged_path.name == "test_db.json"
-
-        connect_params = {
-            "host": "localhost",
-            "port": 5433,
-            "database": "elements",
-            "user": "unstructured",
-            "password": "test",
-        }
-
-        uploader = PostgresUploader(
-            connection_config=PostgresConnectionConfig(
-                host=connect_params["host"],
-                port=connect_params["port"],
-                database=connect_params["database"],
-                username=connect_params["user"],
-                access_config=PostgresAccessConfig(password=connect_params["password"]),
-            )
-        )
+        stager = PostgresUploadStager()
+        staged_path = stager.run(
+            elements_filepath=upload_file,
+            file_data=mock_file_data,
+            output_dir=temp_dir,
+            output_filename=upload_file.name,
+        )

-
+        # The stager should append the `.json` suffix to the output filename passed in.
+        assert staged_path.suffix == upload_file.suffix

-
-
-
-
-
-
-
-            test_embedding=sample_element["embeddings"],
-        )
+        connect_params = {
+            "host": "localhost",
+            "port": 5433,
+            "database": "elements",
+            "user": "unstructured",
+            "password": "test",
+        }

-
-
-
-
-
-
+        uploader = PostgresUploader(
+            connection_config=PostgresConnectionConfig(
+                host=connect_params["host"],
+                port=connect_params["port"],
+                database=connect_params["database"],
+                username=connect_params["user"],
+                access_config=PostgresAccessConfig(password=connect_params["password"]),
            )
+        )
+        uploader.precheck()
+        uploader.run(path=staged_path, file_data=mock_file_data)
+
+        with staged_path.open("r") as f:
+            staged_data = json.load(f)
+
+        sample_element = staged_data[0]
+        expected_num_elements = len(staged_data)
+        validate_destination(
+            connect_params=connect_params,
+            expected_num_elements=expected_num_elements,
+            expected_text=sample_element["text"],
+            test_embedding=sample_element["embeddings"],
+        )
+
+        uploader.run(path=staged_path, file_data=mock_file_data)
+        validate_destination(
+            connect_params=connect_params,
+            expected_num_elements=expected_num_elements,
+            expected_text=sample_element["text"],
+            test_embedding=sample_element["embeddings"],
+        )
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_postgres_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = PostgresUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )