unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
int_test_dir = Path(__file__).parent
|
|
6
|
-
assets_dir = int_test_dir / "assets"
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@pytest.fixture
|
|
10
|
-
def duckdb_schema() -> Path:
|
|
11
|
-
schema_file = assets_dir / "duckdb-schema.sql"
|
|
12
|
-
assert schema_file.exists()
|
|
13
|
-
assert schema_file.is_file()
|
|
14
|
-
return schema_file
|
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
import duckdb
|
|
5
|
-
import pytest
|
|
6
|
-
from _pytest.fixtures import TopRequest
|
|
7
|
-
|
|
8
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
|
|
9
|
-
from test.integration.connectors.utils.validation.destination import (
|
|
10
|
-
StagerValidationConfigs,
|
|
11
|
-
stager_validation,
|
|
12
|
-
)
|
|
13
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
14
|
-
from unstructured_ingest.processes.connectors.duckdb.duckdb import (
|
|
15
|
-
CONNECTOR_TYPE,
|
|
16
|
-
DuckDBConnectionConfig,
|
|
17
|
-
DuckDBUploader,
|
|
18
|
-
DuckDBUploaderConfig,
|
|
19
|
-
DuckDBUploadStager,
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@pytest.fixture
|
|
24
|
-
def provisioned_db_file(duckdb_schema: Path, temp_dir: Path) -> Path:
|
|
25
|
-
db_path = Path(temp_dir) / "temp_duck.db"
|
|
26
|
-
with duckdb.connect(database=db_path) as duckdb_connection:
|
|
27
|
-
with duckdb_schema.open("r") as f:
|
|
28
|
-
query = f.read()
|
|
29
|
-
duckdb_connection.execute(query)
|
|
30
|
-
duckdb_connection.close()
|
|
31
|
-
return db_path
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
|
|
35
|
-
conn = None
|
|
36
|
-
try:
|
|
37
|
-
conn = duckdb.connect(db_path)
|
|
38
|
-
_results = conn.sql("select count(*) from elements").fetchall()
|
|
39
|
-
_count = _results[0][0]
|
|
40
|
-
assert (
|
|
41
|
-
_count == expected_num_elements
|
|
42
|
-
), f"dest check failed: got {_count}, expected {expected_num_elements}"
|
|
43
|
-
conn.close()
|
|
44
|
-
finally:
|
|
45
|
-
if conn:
|
|
46
|
-
conn.close()
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
|
|
50
|
-
def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_dir: Path):
|
|
51
|
-
file_data = FileData(
|
|
52
|
-
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
|
|
53
|
-
connector_type=CONNECTOR_TYPE,
|
|
54
|
-
identifier="mock-file-data",
|
|
55
|
-
)
|
|
56
|
-
|
|
57
|
-
stager = DuckDBUploadStager()
|
|
58
|
-
staged_path = stager.run(
|
|
59
|
-
elements_filepath=upload_file,
|
|
60
|
-
file_data=file_data,
|
|
61
|
-
output_dir=temp_dir,
|
|
62
|
-
output_filename=upload_file.name,
|
|
63
|
-
)
|
|
64
|
-
|
|
65
|
-
connection_config = DuckDBConnectionConfig(database=str(provisioned_db_file))
|
|
66
|
-
upload_config = DuckDBUploaderConfig()
|
|
67
|
-
uploader = DuckDBUploader(connection_config=connection_config, upload_config=upload_config)
|
|
68
|
-
|
|
69
|
-
uploader.run(path=staged_path, file_data=file_data)
|
|
70
|
-
|
|
71
|
-
with staged_path.open() as f:
|
|
72
|
-
data = json.load(f)
|
|
73
|
-
validate_duckdb_destination(db_path=provisioned_db_file, expected_num_elements=len(data))
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
|
|
77
|
-
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
78
|
-
def test_duckdb_stager(
|
|
79
|
-
request: TopRequest,
|
|
80
|
-
upload_file_str: str,
|
|
81
|
-
tmp_path: Path,
|
|
82
|
-
):
|
|
83
|
-
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
84
|
-
stager = DuckDBUploadStager()
|
|
85
|
-
stager_validation(
|
|
86
|
-
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
87
|
-
input_file=upload_file,
|
|
88
|
-
stager=stager,
|
|
89
|
-
tmp_dir=tmp_path,
|
|
90
|
-
)
|
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import uuid
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import Generator
|
|
5
|
-
|
|
6
|
-
import duckdb
|
|
7
|
-
import pandas as pd
|
|
8
|
-
import pytest
|
|
9
|
-
|
|
10
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
|
|
11
|
-
from test.integration.utils import requires_env
|
|
12
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
13
|
-
from unstructured_ingest.processes.connectors.duckdb.motherduck import (
|
|
14
|
-
CONNECTOR_TYPE,
|
|
15
|
-
MotherDuckAccessConfig,
|
|
16
|
-
MotherDuckConnectionConfig,
|
|
17
|
-
MotherDuckUploader,
|
|
18
|
-
MotherDuckUploaderConfig,
|
|
19
|
-
MotherDuckUploadStager,
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@pytest.fixture
|
|
24
|
-
def md_token() -> str:
|
|
25
|
-
motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
|
|
26
|
-
assert motherduck_token
|
|
27
|
-
return motherduck_token
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@pytest.fixture
|
|
31
|
-
def provisioned_db(md_token: str, duckdb_schema: Path) -> Generator[str, None, None]:
|
|
32
|
-
database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
|
|
33
|
-
try:
|
|
34
|
-
with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
|
|
35
|
-
with duckdb_schema.open("r") as f:
|
|
36
|
-
query = f.read()
|
|
37
|
-
md_conn.execute(f"CREATE DATABASE {database_name}")
|
|
38
|
-
md_conn.execute(f"USE {database_name}")
|
|
39
|
-
md_conn.execute(query)
|
|
40
|
-
md_conn.close()
|
|
41
|
-
yield database_name
|
|
42
|
-
finally:
|
|
43
|
-
with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
|
|
44
|
-
md_conn.execute(f"DROP DATABASE {database_name}")
|
|
45
|
-
md_conn.close()
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def validate_motherduck_destination(database: str, expected_num_elements: int, md_token: str):
|
|
49
|
-
conn = None
|
|
50
|
-
try:
|
|
51
|
-
conn = duckdb.connect(f"md:?motherduck_token={md_token}")
|
|
52
|
-
conn.execute(f"USE {database}")
|
|
53
|
-
_results = conn.sql("select count(*) from elements").fetchall()
|
|
54
|
-
_count = _results[0][0]
|
|
55
|
-
assert (
|
|
56
|
-
_count == expected_num_elements
|
|
57
|
-
), f"dest check failed: got {_count}, expected {expected_num_elements}"
|
|
58
|
-
conn.close()
|
|
59
|
-
finally:
|
|
60
|
-
if conn:
|
|
61
|
-
conn.close()
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
|
|
65
|
-
@requires_env("MOTHERDUCK_TOKEN")
|
|
66
|
-
def test_motherduck_destination(
|
|
67
|
-
md_token: str, upload_file: Path, provisioned_db: str, temp_dir: Path
|
|
68
|
-
):
|
|
69
|
-
file_data = FileData(
|
|
70
|
-
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
|
|
71
|
-
connector_type=CONNECTOR_TYPE,
|
|
72
|
-
identifier="mock-file-data",
|
|
73
|
-
)
|
|
74
|
-
|
|
75
|
-
stager = MotherDuckUploadStager()
|
|
76
|
-
staged_path = stager.run(
|
|
77
|
-
elements_filepath=upload_file,
|
|
78
|
-
file_data=file_data,
|
|
79
|
-
output_dir=temp_dir,
|
|
80
|
-
output_filename=upload_file.name,
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
access_config = MotherDuckAccessConfig(md_token=md_token)
|
|
84
|
-
connection_config = MotherDuckConnectionConfig(
|
|
85
|
-
database=provisioned_db, access_config=access_config
|
|
86
|
-
)
|
|
87
|
-
upload_config = MotherDuckUploaderConfig()
|
|
88
|
-
uploader = MotherDuckUploader(connection_config=connection_config, upload_config=upload_config)
|
|
89
|
-
|
|
90
|
-
uploader.run(path=staged_path, file_data=file_data)
|
|
91
|
-
|
|
92
|
-
staged_df = pd.read_json(staged_path, orient="records", lines=True)
|
|
93
|
-
validate_motherduck_destination(
|
|
94
|
-
database=provisioned_db, expected_num_elements=len(staged_df), md_token=md_token
|
|
95
|
-
)
|
|
File without changes
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
import pandas as pd
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
int_test_dir = Path(__file__).parent
|
|
8
|
-
assets_dir = int_test_dir / "assets"
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@pytest.fixture
|
|
12
|
-
def movies_dataframe() -> pd.DataFrame:
|
|
13
|
-
movies_file = assets_dir / "wiki_movie_plots_small.csv"
|
|
14
|
-
assert movies_file.exists()
|
|
15
|
-
assert movies_file.is_file()
|
|
16
|
-
return pd.read_csv(movies_file).dropna().reset_index()
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@pytest.fixture
|
|
20
|
-
def opensearch_elements_mapping() -> dict:
|
|
21
|
-
elements_mapping_file = assets_dir / "opensearch_elements_mappings.json"
|
|
22
|
-
assert elements_mapping_file.exists()
|
|
23
|
-
assert elements_mapping_file.is_file()
|
|
24
|
-
with elements_mapping_file.open() as fp:
|
|
25
|
-
return json.load(fp)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
@pytest.fixture
|
|
29
|
-
def elasticsearch_elements_mapping() -> dict:
|
|
30
|
-
elements_mapping_file = assets_dir / "elasticsearch_elements_mappings.json"
|
|
31
|
-
assert elements_mapping_file.exists()
|
|
32
|
-
assert elements_mapping_file.is_file()
|
|
33
|
-
with elements_mapping_file.open() as fp:
|
|
34
|
-
return json.load(fp)
|
|
@@ -1,331 +0,0 @@
|
|
|
1
|
-
# ruff: noqa: I001
|
|
2
|
-
import json
|
|
3
|
-
import tempfile
|
|
4
|
-
import time
|
|
5
|
-
from contextlib import contextmanager
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import Generator
|
|
8
|
-
from test.integration.connectors.utils.validation.destination import (
|
|
9
|
-
StagerValidationConfigs,
|
|
10
|
-
stager_validation,
|
|
11
|
-
)
|
|
12
|
-
import pandas as pd
|
|
13
|
-
import pytest
|
|
14
|
-
from _pytest.fixtures import TopRequest
|
|
15
|
-
from elasticsearch import Elasticsearch as ElasticsearchClient
|
|
16
|
-
from elasticsearch.helpers import bulk
|
|
17
|
-
|
|
18
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, NOSQL_TAG
|
|
19
|
-
from test.integration.connectors.utils.docker import HealthCheck, container_context
|
|
20
|
-
from test.integration.connectors.utils.validation.source import (
|
|
21
|
-
SourceValidationConfigs,
|
|
22
|
-
source_connector_validation,
|
|
23
|
-
)
|
|
24
|
-
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
25
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
26
|
-
from unstructured_ingest.processes.connectors.elasticsearch.elasticsearch import (
|
|
27
|
-
CONNECTOR_TYPE,
|
|
28
|
-
ElasticsearchAccessConfig,
|
|
29
|
-
ElasticsearchConnectionConfig,
|
|
30
|
-
ElasticsearchDownloader,
|
|
31
|
-
ElasticsearchDownloaderConfig,
|
|
32
|
-
ElasticsearchIndexer,
|
|
33
|
-
ElasticsearchIndexerConfig,
|
|
34
|
-
ElasticsearchUploader,
|
|
35
|
-
ElasticsearchUploaderConfig,
|
|
36
|
-
ElasticsearchUploadStager,
|
|
37
|
-
ElasticsearchUploadStagerConfig,
|
|
38
|
-
)
|
|
39
|
-
|
|
40
|
-
SOURCE_INDEX_NAME = "movies"
|
|
41
|
-
DESTINATION_INDEX_NAME = "elements"
|
|
42
|
-
ES_USERNAME = "elastic"
|
|
43
|
-
ES_PASSWORD = "elastic_password"
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
@contextmanager
|
|
47
|
-
def get_client() -> Generator[ElasticsearchClient, None, None]:
|
|
48
|
-
with ElasticsearchClient(
|
|
49
|
-
hosts="http://localhost:9200", basic_auth=(ES_USERNAME, ES_PASSWORD), request_timeout=30
|
|
50
|
-
) as client:
|
|
51
|
-
yield client
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def form_elasticsearch_doc_dict(i, csv_row):
|
|
55
|
-
return {
|
|
56
|
-
"_index": SOURCE_INDEX_NAME,
|
|
57
|
-
"_id": i,
|
|
58
|
-
"_source": {
|
|
59
|
-
"title": csv_row["Title"],
|
|
60
|
-
"ethnicity": csv_row["Origin/Ethnicity"],
|
|
61
|
-
"director": csv_row["Director"],
|
|
62
|
-
"cast": csv_row["Cast"],
|
|
63
|
-
"genre": csv_row["Genre"],
|
|
64
|
-
"plot": csv_row["Plot"],
|
|
65
|
-
"year": csv_row["Release Year"],
|
|
66
|
-
"wiki_page": csv_row["Wiki Page"],
|
|
67
|
-
},
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def dataframe_to_upload_data(df: pd.DataFrame) -> list[dict]:
|
|
72
|
-
upload_data = []
|
|
73
|
-
for index, row in df.iterrows():
|
|
74
|
-
upload_data.append(form_elasticsearch_doc_dict(index, row))
|
|
75
|
-
return upload_data
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def get_index_count(client: ElasticsearchClient, index_name: str) -> int:
|
|
79
|
-
count_resp = client.cat.count(index=index_name, format="json")
|
|
80
|
-
return int(count_resp[0]["count"])
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def validate_count(
|
|
84
|
-
client: ElasticsearchClient,
|
|
85
|
-
index_name: str,
|
|
86
|
-
expected_count: int,
|
|
87
|
-
retries: int = 10,
|
|
88
|
-
interval: int = 1,
|
|
89
|
-
) -> None:
|
|
90
|
-
current_count = get_index_count(client, index_name)
|
|
91
|
-
if current_count == expected_count:
|
|
92
|
-
return
|
|
93
|
-
tries = 0
|
|
94
|
-
while tries < retries:
|
|
95
|
-
print(
|
|
96
|
-
f"retrying validation to check if expected count "
|
|
97
|
-
f"{expected_count} will match current count {current_count}"
|
|
98
|
-
)
|
|
99
|
-
time.sleep(interval)
|
|
100
|
-
current_count = get_index_count(client, index_name)
|
|
101
|
-
if current_count == expected_count:
|
|
102
|
-
break
|
|
103
|
-
assert current_count == expected_count, (
|
|
104
|
-
f"Expected count ({expected_count}) doesn't match how "
|
|
105
|
-
f"much came back from index: {current_count}"
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
def seed_source_db(df: pd.DataFrame):
|
|
110
|
-
mapping = {
|
|
111
|
-
"properties": {
|
|
112
|
-
"title": {"type": "text", "analyzer": "english"},
|
|
113
|
-
"ethnicity": {"type": "text", "analyzer": "standard"},
|
|
114
|
-
"director": {"type": "text", "analyzer": "standard"},
|
|
115
|
-
"cast": {"type": "text", "analyzer": "standard"},
|
|
116
|
-
"genre": {"type": "text", "analyzer": "standard"},
|
|
117
|
-
"plot": {"type": "text", "analyzer": "english"},
|
|
118
|
-
"year": {"type": "integer"},
|
|
119
|
-
"wiki_page": {"type": "keyword"},
|
|
120
|
-
},
|
|
121
|
-
}
|
|
122
|
-
# seed content
|
|
123
|
-
with get_client() as client:
|
|
124
|
-
client.indices.create(index=SOURCE_INDEX_NAME, mappings=mapping)
|
|
125
|
-
upload_data = dataframe_to_upload_data(df=df)
|
|
126
|
-
bulk(client, upload_data)
|
|
127
|
-
client.indices.refresh(index=SOURCE_INDEX_NAME)
|
|
128
|
-
count = get_index_count(client, SOURCE_INDEX_NAME)
|
|
129
|
-
print(f"seeded {SOURCE_INDEX_NAME} index with {count} records")
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
@pytest.fixture
|
|
133
|
-
def source_index(movies_dataframe: pd.DataFrame) -> str:
|
|
134
|
-
with container_context(
|
|
135
|
-
image="docker.elastic.co/elasticsearch/elasticsearch:8.7.0",
|
|
136
|
-
ports={9200: 9200, 9300: 9300},
|
|
137
|
-
environment={
|
|
138
|
-
"discovery.type": "single-node",
|
|
139
|
-
"xpack.security.enabled": True,
|
|
140
|
-
"ELASTIC_PASSWORD": ES_PASSWORD,
|
|
141
|
-
"ELASTIC_USER": ES_USERNAME,
|
|
142
|
-
},
|
|
143
|
-
healthcheck=HealthCheck(
|
|
144
|
-
test="curl --silent --fail -u ${ELASTIC_USER}:${ELASTIC_PASSWORD} localhost:9200/_cluster/health || exit 1", # noqa: E501
|
|
145
|
-
interval=1,
|
|
146
|
-
start_period=5,
|
|
147
|
-
),
|
|
148
|
-
):
|
|
149
|
-
seed_source_db(df=movies_dataframe)
|
|
150
|
-
yield SOURCE_INDEX_NAME
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
@pytest.fixture
|
|
154
|
-
def destination_index(elasticsearch_elements_mapping: dict) -> str:
|
|
155
|
-
with container_context(
|
|
156
|
-
image="docker.elastic.co/elasticsearch/elasticsearch:8.7.0",
|
|
157
|
-
ports={9200: 9200, 9300: 9300},
|
|
158
|
-
environment={
|
|
159
|
-
"discovery.type": "single-node",
|
|
160
|
-
"xpack.security.enabled": True,
|
|
161
|
-
"ELASTIC_PASSWORD": ES_PASSWORD,
|
|
162
|
-
"ELASTIC_USER": ES_USERNAME,
|
|
163
|
-
},
|
|
164
|
-
healthcheck=HealthCheck(
|
|
165
|
-
test="curl --silent --fail -u ${ELASTIC_USER}:${ELASTIC_PASSWORD} localhost:9200/_cluster/health || exit 1", # noqa: E501
|
|
166
|
-
interval=1,
|
|
167
|
-
start_period=5,
|
|
168
|
-
),
|
|
169
|
-
):
|
|
170
|
-
with get_client() as client:
|
|
171
|
-
response = client.indices.create(
|
|
172
|
-
index=DESTINATION_INDEX_NAME, mappings=elasticsearch_elements_mapping
|
|
173
|
-
)
|
|
174
|
-
if not response["acknowledged"]:
|
|
175
|
-
raise RuntimeError(f"failed to create index: {response}")
|
|
176
|
-
yield DESTINATION_INDEX_NAME
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
@pytest.mark.asyncio
|
|
180
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
181
|
-
async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.DataFrame):
|
|
182
|
-
indexer_config = ElasticsearchIndexerConfig(index_name=source_index)
|
|
183
|
-
with tempfile.TemporaryDirectory() as tempdir:
|
|
184
|
-
tempdir_path = Path(tempdir)
|
|
185
|
-
connection_config = ElasticsearchConnectionConfig(
|
|
186
|
-
access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
|
|
187
|
-
username=ES_USERNAME,
|
|
188
|
-
hosts=["http://localhost:9200"],
|
|
189
|
-
)
|
|
190
|
-
download_config = ElasticsearchDownloaderConfig(download_dir=tempdir_path)
|
|
191
|
-
indexer = ElasticsearchIndexer(
|
|
192
|
-
connection_config=connection_config, index_config=indexer_config
|
|
193
|
-
)
|
|
194
|
-
downloader = ElasticsearchDownloader(
|
|
195
|
-
connection_config=connection_config, download_config=download_config
|
|
196
|
-
)
|
|
197
|
-
expected_num_files = len(movies_dataframe)
|
|
198
|
-
await source_connector_validation(
|
|
199
|
-
indexer=indexer,
|
|
200
|
-
downloader=downloader,
|
|
201
|
-
configs=SourceValidationConfigs(
|
|
202
|
-
test_id=CONNECTOR_TYPE,
|
|
203
|
-
expected_num_files=expected_num_files,
|
|
204
|
-
expected_number_indexed_file_data=1,
|
|
205
|
-
validate_downloaded_files=True,
|
|
206
|
-
),
|
|
207
|
-
)
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
211
|
-
def test_elasticsearch_source_precheck_fail_no_cluster():
|
|
212
|
-
indexer_config = ElasticsearchIndexerConfig(index_name="index")
|
|
213
|
-
|
|
214
|
-
connection_config = ElasticsearchConnectionConfig(
|
|
215
|
-
access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
|
|
216
|
-
username=ES_USERNAME,
|
|
217
|
-
hosts=["http://localhost:9200"],
|
|
218
|
-
)
|
|
219
|
-
indexer = ElasticsearchIndexer(connection_config=connection_config, index_config=indexer_config)
|
|
220
|
-
with pytest.raises(SourceConnectionError):
|
|
221
|
-
indexer.precheck()
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
|
|
225
|
-
def test_elasticsearch_source_precheck_fail_no_index(source_index: str):
|
|
226
|
-
indexer_config = ElasticsearchIndexerConfig(index_name="index")
|
|
227
|
-
|
|
228
|
-
connection_config = ElasticsearchConnectionConfig(
|
|
229
|
-
access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
|
|
230
|
-
username=ES_USERNAME,
|
|
231
|
-
hosts=["http://localhost:9200"],
|
|
232
|
-
)
|
|
233
|
-
indexer = ElasticsearchIndexer(connection_config=connection_config, index_config=indexer_config)
|
|
234
|
-
with pytest.raises(SourceConnectionError):
|
|
235
|
-
indexer.precheck()
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
@pytest.mark.asyncio
|
|
239
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
240
|
-
async def test_elasticsearch_destination(
|
|
241
|
-
upload_file: Path,
|
|
242
|
-
destination_index: str,
|
|
243
|
-
tmp_path: Path,
|
|
244
|
-
):
|
|
245
|
-
file_data = FileData(
|
|
246
|
-
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
|
|
247
|
-
connector_type=CONNECTOR_TYPE,
|
|
248
|
-
identifier="mock file data",
|
|
249
|
-
)
|
|
250
|
-
connection_config = ElasticsearchConnectionConfig(
|
|
251
|
-
access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
|
|
252
|
-
username=ES_USERNAME,
|
|
253
|
-
hosts=["http://localhost:9200"],
|
|
254
|
-
)
|
|
255
|
-
stager = ElasticsearchUploadStager(
|
|
256
|
-
upload_stager_config=ElasticsearchUploadStagerConfig(index_name=destination_index)
|
|
257
|
-
)
|
|
258
|
-
|
|
259
|
-
uploader = ElasticsearchUploader(
|
|
260
|
-
connection_config=connection_config,
|
|
261
|
-
upload_config=ElasticsearchUploaderConfig(index_name=destination_index),
|
|
262
|
-
)
|
|
263
|
-
staged_filepath = stager.run(
|
|
264
|
-
elements_filepath=upload_file,
|
|
265
|
-
file_data=file_data,
|
|
266
|
-
output_dir=tmp_path,
|
|
267
|
-
output_filename=upload_file.name,
|
|
268
|
-
)
|
|
269
|
-
uploader.precheck()
|
|
270
|
-
uploader.run(path=staged_filepath, file_data=file_data)
|
|
271
|
-
|
|
272
|
-
# Run validation
|
|
273
|
-
with staged_filepath.open() as f:
|
|
274
|
-
staged_elements = json.load(f)
|
|
275
|
-
expected_count = len(staged_elements)
|
|
276
|
-
with get_client() as client:
|
|
277
|
-
validate_count(client=client, expected_count=expected_count, index_name=destination_index)
|
|
278
|
-
|
|
279
|
-
# Rerun and make sure the same documents get updated
|
|
280
|
-
uploader.run(path=staged_filepath, file_data=file_data)
|
|
281
|
-
with get_client() as client:
|
|
282
|
-
validate_count(client=client, expected_count=expected_count, index_name=destination_index)
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
286
|
-
def test_elasticsearch_destination_precheck_fail():
|
|
287
|
-
connection_config = ElasticsearchConnectionConfig(
|
|
288
|
-
access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
|
|
289
|
-
username=ES_USERNAME,
|
|
290
|
-
hosts=["http://localhost:9200"],
|
|
291
|
-
)
|
|
292
|
-
uploader = ElasticsearchUploader(
|
|
293
|
-
connection_config=connection_config,
|
|
294
|
-
upload_config=ElasticsearchUploaderConfig(index_name="index"),
|
|
295
|
-
)
|
|
296
|
-
with pytest.raises(DestinationConnectionError):
|
|
297
|
-
uploader.precheck()
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
301
|
-
def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str):
|
|
302
|
-
connection_config = ElasticsearchConnectionConfig(
|
|
303
|
-
access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
|
|
304
|
-
username=ES_USERNAME,
|
|
305
|
-
hosts=["http://localhost:9200"],
|
|
306
|
-
)
|
|
307
|
-
uploader = ElasticsearchUploader(
|
|
308
|
-
connection_config=connection_config,
|
|
309
|
-
upload_config=ElasticsearchUploaderConfig(index_name="index"),
|
|
310
|
-
)
|
|
311
|
-
with pytest.raises(DestinationConnectionError):
|
|
312
|
-
uploader.precheck()
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
|
|
316
|
-
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
317
|
-
def test_elasticsearch_stager(
|
|
318
|
-
request: TopRequest,
|
|
319
|
-
upload_file_str: str,
|
|
320
|
-
tmp_path: Path,
|
|
321
|
-
):
|
|
322
|
-
upload_file: Path = request.getfixturevalue(upload_file_str)
|
|
323
|
-
stager = ElasticsearchUploadStager(
|
|
324
|
-
upload_stager_config=ElasticsearchUploadStagerConfig(index_name="mock_index")
|
|
325
|
-
)
|
|
326
|
-
stager_validation(
|
|
327
|
-
configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
|
|
328
|
-
input_file=upload_file,
|
|
329
|
-
stager=stager,
|
|
330
|
-
tmp_dir=tmp_path,
|
|
331
|
-
)
|