unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (87):
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
@@ -29,14 +29,3 @@ async def test_chunker_api(chunker_file: Path, strategy: str):
29
29
  chunker = Chunker(config=chunker_config)
30
30
  results = await chunker.run_async(elements_filepath=chunker_file)
31
31
  assert results
32
-
33
-
34
- @pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
35
- @pytest.mark.parametrize("strategy", ["basic", "by_title"])
36
- def test_chunker_basic(chunker_file: Path, strategy: str):
37
- chunker_config = ChunkerConfig(
38
- chunking_strategy=strategy,
39
- )
40
- chunker = Chunker(config=chunker_config)
41
- results = chunker.run(elements_filepath=chunker_file)
42
- assert results
@@ -6,7 +6,7 @@ import pytest
6
6
 
7
7
  from unstructured_ingest.v2.logger import logger
8
8
 
9
- FILENAME = "DA-1p-with-duplicate-pages.pdf.json"
9
+ FILENAME = Path("DA-1p-with-duplicate-pages.pdf.json")
10
10
 
11
11
 
12
12
  @pytest.fixture
@@ -19,6 +19,16 @@ def upload_file() -> Path:
19
19
  return upload_file
20
20
 
21
21
 
22
+ @pytest.fixture
23
+ def upload_file_ndjson() -> Path:
24
+ int_test_dir = Path(__file__).parent
25
+ assets_dir = int_test_dir / "assets"
26
+ upload_file = assets_dir / FILENAME.with_suffix(".ndjson")
27
+ assert upload_file.exists()
28
+ assert upload_file.is_file()
29
+ return upload_file
30
+
31
+
22
32
  @pytest.fixture
23
33
  def temp_dir() -> Generator[Path, None, None]:
24
34
  with tempfile.TemporaryDirectory() as temp_dir:
@@ -11,8 +11,8 @@ from databricks.sdk import WorkspaceClient
11
11
  from databricks.sdk.errors.platform import NotFound
12
12
 
13
13
  from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
14
- from test.integration.connectors.utils.validation import (
15
- ValidationConfigs,
14
+ from test.integration.connectors.utils.validation.source import (
15
+ SourceValidationConfigs,
16
16
  source_connector_validation,
17
17
  )
18
18
  from test.integration.utils import requires_env
@@ -82,7 +82,7 @@ async def test_volumes_native_source():
82
82
  await source_connector_validation(
83
83
  indexer=indexer,
84
84
  downloader=downloader,
85
- configs=ValidationConfigs(
85
+ configs=SourceValidationConfigs(
86
86
  test_id="databricks_volumes_native",
87
87
  expected_num_files=1,
88
88
  ),
@@ -156,6 +156,7 @@ async def test_volumes_native_destination(upload_file: Path):
156
156
  catalog=env_data.catalog,
157
157
  ),
158
158
  )
159
+ uploader.precheck()
159
160
  if uploader.is_async():
160
161
  await uploader.run_async(path=upload_file, file_data=file_data)
161
162
  else:
@@ -0,0 +1,14 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+
5
+ int_test_dir = Path(__file__).parent
6
+ assets_dir = int_test_dir / "assets"
7
+
8
+
9
+ @pytest.fixture
10
+ def duckdb_schema() -> Path:
11
+ schema_file = assets_dir / "duckdb-schema.sql"
12
+ assert schema_file.exists()
13
+ assert schema_file.is_file()
14
+ return schema_file
@@ -1,13 +1,15 @@
1
- import tempfile
2
- from contextlib import contextmanager
1
+ import json
3
2
  from pathlib import Path
4
- from typing import Generator
5
3
 
6
4
  import duckdb
7
- import pandas as pd
8
5
  import pytest
6
+ from _pytest.fixtures import TopRequest
9
7
 
10
8
  from test.integration.connectors.utils.constants import DESTINATION_TAG
9
+ from test.integration.connectors.utils.validation.destination import (
10
+ StagerValidationConfigs,
11
+ stager_validation,
12
+ )
11
13
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
12
14
  from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
13
15
  CONNECTOR_TYPE,
@@ -18,19 +20,15 @@ from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
18
20
  )
19
21
 
20
22
 
21
- @contextmanager
22
- def duckdbd_setup() -> Generator[Path, None, None]:
23
- with tempfile.TemporaryDirectory() as temp_dir:
24
- db_path = Path(temp_dir) / "temp_duck.db"
25
- db_init_path = Path(__file__).parent / "duckdb-schema.sql"
26
- assert db_init_path.exists()
27
- assert db_init_path.is_file()
28
- with duckdb.connect(database=db_path) as duckdb_connection:
29
- with db_init_path.open("r") as f:
30
- query = f.read()
31
- duckdb_connection.execute(query)
32
- duckdb_connection.close()
33
- yield db_path
23
+ @pytest.fixture
24
+ def provisioned_db_file(duckdb_schema: Path, temp_dir: Path) -> Path:
25
+ db_path = Path(temp_dir) / "temp_duck.db"
26
+ with duckdb.connect(database=db_path) as duckdb_connection:
27
+ with duckdb_schema.open("r") as f:
28
+ query = f.read()
29
+ duckdb_connection.execute(query)
30
+ duckdb_connection.close()
31
+ return db_path
34
32
 
35
33
 
36
34
  def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
@@ -49,34 +47,43 @@ def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
49
47
 
50
48
 
51
49
  @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb")
52
- def test_duckdb_destination(upload_file: Path):
53
- with duckdbd_setup() as test_db_path:
54
- with tempfile.TemporaryDirectory() as temp_dir:
55
- file_data = FileData(
56
- source_identifiers=SourceIdentifiers(
57
- fullpath=upload_file.name, filename=upload_file.name
58
- ),
59
- connector_type=CONNECTOR_TYPE,
60
- identifier="mock-file-data",
61
- )
50
+ def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_dir: Path):
51
+ file_data = FileData(
52
+ source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
53
+ connector_type=CONNECTOR_TYPE,
54
+ identifier="mock-file-data",
55
+ )
56
+
57
+ stager = DuckDBUploadStager()
58
+ staged_path = stager.run(
59
+ elements_filepath=upload_file,
60
+ file_data=file_data,
61
+ output_dir=temp_dir,
62
+ output_filename=upload_file.name,
63
+ )
64
+
65
+ connection_config = DuckDBConnectionConfig(database=str(provisioned_db_file))
66
+ upload_config = DuckDBUploaderConfig()
67
+ uploader = DuckDBUploader(connection_config=connection_config, upload_config=upload_config)
62
68
 
63
- # deafults to default stager config
64
- stager = DuckDBUploadStager()
65
- stager_params = {
66
- "elements_filepath": upload_file,
67
- "file_data": file_data,
68
- "output_dir": temp_dir,
69
- "output_filename": "test_db",
70
- }
71
- staged_path = stager.run(**stager_params)
69
+ uploader.run(path=staged_path, file_data=file_data)
72
70
 
73
- connection_config = DuckDBConnectionConfig(database=str(test_db_path))
74
- upload_config = DuckDBUploaderConfig()
75
- uploader = DuckDBUploader(
76
- connection_config=connection_config, upload_config=upload_config
77
- )
71
+ with staged_path.open() as f:
72
+ data = json.load(f)
73
+ validate_duckdb_destination(db_path=provisioned_db_file, expected_num_elements=len(data))
78
74
 
79
- uploader.run(path=staged_path, file_data=file_data)
80
75
 
81
- staged_df = pd.read_json(staged_path, orient="records", lines=True)
82
- validate_duckdb_destination(db_path=test_db_path, expected_num_elements=len(staged_df))
76
+ @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
77
+ def test_duckdb_stager(
78
+ request: TopRequest,
79
+ upload_file_str: str,
80
+ tmp_path: Path,
81
+ ):
82
+ upload_file: Path = request.getfixturevalue(upload_file_str)
83
+ stager = DuckDBUploadStager()
84
+ stager_validation(
85
+ configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
86
+ input_file=upload_file,
87
+ stager=stager,
88
+ tmp_dir=tmp_path,
89
+ )
@@ -1,7 +1,5 @@
1
1
  import os
2
- import tempfile
3
2
  import uuid
4
- from contextlib import contextmanager
5
3
  from pathlib import Path
6
4
  from typing import Generator
7
5
 
@@ -22,15 +20,19 @@ from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
22
20
  )
23
21
 
24
22
 
25
- @contextmanager
26
- def motherduck_setup(md_token: str) -> Generator[Path, None, None]:
23
+ @pytest.fixture
24
+ def md_token() -> str:
25
+ motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
26
+ assert motherduck_token
27
+ return motherduck_token
28
+
29
+
30
+ @pytest.fixture
31
+ def provisioned_db(md_token: str, duckdb_schema: Path) -> Generator[str, None, None]:
27
32
  database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
28
33
  try:
29
- db_init_path = Path(__file__).parent / "duckdb-schema.sql"
30
- assert db_init_path.exists()
31
- assert db_init_path.is_file()
32
34
  with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
33
- with db_init_path.open("r") as f:
35
+ with duckdb_schema.open("r") as f:
34
36
  query = f.read()
35
37
  md_conn.execute(f"CREATE DATABASE {database_name}")
36
38
  md_conn.execute(f"USE {database_name}")
@@ -59,48 +61,35 @@ def validate_motherduck_destination(database: str, expected_num_elements: int, m
59
61
  conn.close()
60
62
 
61
63
 
62
- def get_motherduck_token() -> dict:
63
- motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
64
- assert motherduck_token
65
- return motherduck_token
66
-
67
-
68
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "motherduck")
64
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
69
65
  @requires_env("MOTHERDUCK_TOKEN")
70
- def test_motherduck_destination(upload_file: Path):
71
- md_token = get_motherduck_token()
72
- with motherduck_setup(md_token) as test_database:
73
- with tempfile.TemporaryDirectory() as temp_dir:
74
- file_data = FileData(
75
- source_identifiers=SourceIdentifiers(
76
- fullpath=upload_file.name, filename=upload_file.name
77
- ),
78
- connector_type=CONNECTOR_TYPE,
79
- identifier="mock-file-data",
80
- )
66
+ def test_motherduck_destination(
67
+ md_token: str, upload_file: Path, provisioned_db: str, temp_dir: Path
68
+ ):
69
+ file_data = FileData(
70
+ source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
71
+ connector_type=CONNECTOR_TYPE,
72
+ identifier="mock-file-data",
73
+ )
81
74
 
82
- # deafults to default stager config
83
- stager = MotherDuckUploadStager()
84
- stager_params = {
85
- "elements_filepath": upload_file,
86
- "file_data": file_data,
87
- "output_dir": temp_dir,
88
- "output_filename": "test_db",
89
- }
90
- staged_path = stager.run(**stager_params)
75
+ stager = MotherDuckUploadStager()
76
+ staged_path = stager.run(
77
+ elements_filepath=upload_file,
78
+ file_data=file_data,
79
+ output_dir=temp_dir,
80
+ output_filename=upload_file.name,
81
+ )
91
82
 
92
- access_config = MotherDuckAccessConfig(md_token=md_token)
93
- connection_config = MotherDuckConnectionConfig(
94
- database=test_database, access_config=access_config
95
- )
96
- upload_config = MotherDuckUploaderConfig()
97
- uploader = MotherDuckUploader(
98
- connection_config=connection_config, upload_config=upload_config
99
- )
83
+ access_config = MotherDuckAccessConfig(md_token=md_token)
84
+ connection_config = MotherDuckConnectionConfig(
85
+ database=provisioned_db, access_config=access_config
86
+ )
87
+ upload_config = MotherDuckUploaderConfig()
88
+ uploader = MotherDuckUploader(connection_config=connection_config, upload_config=upload_config)
100
89
 
101
- uploader.run(path=staged_path, file_data=file_data)
90
+ uploader.run(path=staged_path, file_data=file_data)
102
91
 
103
- staged_df = pd.read_json(staged_path, orient="records", lines=True)
104
- validate_motherduck_destination(
105
- database=test_database, expected_num_elements=len(staged_df), md_token=md_token
106
- )
92
+ staged_df = pd.read_json(staged_path, orient="records", lines=True)
93
+ validate_motherduck_destination(
94
+ database=provisioned_db, expected_num_elements=len(staged_df), md_token=md_token
95
+ )
@@ -5,16 +5,20 @@ import time
5
5
  from contextlib import contextmanager
6
6
  from pathlib import Path
7
7
  from typing import Generator
8
-
8
+ from test.integration.connectors.utils.validation.destination import (
9
+ StagerValidationConfigs,
10
+ stager_validation,
11
+ )
9
12
  import pandas as pd
10
13
  import pytest
14
+ from _pytest.fixtures import TopRequest
11
15
  from elasticsearch import Elasticsearch as ElasticsearchClient
12
16
  from elasticsearch.helpers import bulk
13
17
 
14
18
  from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
15
19
  from test.integration.connectors.utils.docker import HealthCheck, container_context
16
- from test.integration.connectors.utils.validation import (
17
- ValidationConfigs,
20
+ from test.integration.connectors.utils.validation.source import (
21
+ SourceValidationConfigs,
18
22
  source_connector_validation,
19
23
  )
20
24
  from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
@@ -194,7 +198,7 @@ async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.Data
194
198
  await source_connector_validation(
195
199
  indexer=indexer,
196
200
  downloader=downloader,
197
- configs=ValidationConfigs(
201
+ configs=SourceValidationConfigs(
198
202
  test_id=CONNECTOR_TYPE,
199
203
  expected_num_files=expected_num_files,
200
204
  expected_number_indexed_file_data=1,
@@ -306,3 +310,21 @@ def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str
306
310
  )
307
311
  with pytest.raises(DestinationConnectionError):
308
312
  uploader.precheck()
313
+
314
+
315
+ @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
316
+ def test_elasticsearch_stager(
317
+ request: TopRequest,
318
+ upload_file_str: str,
319
+ tmp_path: Path,
320
+ ):
321
+ upload_file: Path = request.getfixturevalue(upload_file_str)
322
+ stager = ElasticsearchUploadStager(
323
+ upload_stager_config=ElasticsearchUploadStagerConfig(index_name="mock_index")
324
+ )
325
+ stager_validation(
326
+ configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
327
+ input_file=upload_file,
328
+ stager=stager,
329
+ tmp_dir=tmp_path,
330
+ )
@@ -7,12 +7,17 @@ from typing import Generator
7
7
 
8
8
  import pandas as pd
9
9
  import pytest
10
+ from _pytest.fixtures import TopRequest
10
11
  from opensearchpy import Document, Keyword, OpenSearch, Text
11
12
 
12
13
  from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
13
14
  from test.integration.connectors.utils.docker import HealthCheck, container_context
14
- from test.integration.connectors.utils.validation import (
15
- ValidationConfigs,
15
+ from test.integration.connectors.utils.validation.destination import (
16
+ StagerValidationConfigs,
17
+ stager_validation,
18
+ )
19
+ from test.integration.connectors.utils.validation.source import (
20
+ SourceValidationConfigs,
16
21
  source_connector_validation,
17
22
  )
18
23
  from unstructured_ingest.error import (
@@ -183,7 +188,7 @@ async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFra
183
188
  await source_connector_validation(
184
189
  indexer=indexer,
185
190
  downloader=downloader,
186
- configs=ValidationConfigs(
191
+ configs=SourceValidationConfigs(
187
192
  test_id=CONNECTOR_TYPE,
188
193
  expected_num_files=expected_num_files,
189
194
  expected_number_indexed_file_data=1,
@@ -300,3 +305,21 @@ def test_opensearch_destination_precheck_fail_no_index(destination_index: str):
300
305
  )
301
306
  with pytest.raises(DestinationConnectionError):
302
307
  uploader.precheck()
308
+
309
+
310
+ @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
311
+ def test_opensearch_stager(
312
+ request: TopRequest,
313
+ upload_file_str: str,
314
+ tmp_path: Path,
315
+ ):
316
+ upload_file: Path = request.getfixturevalue(upload_file_str)
317
+ stager = OpenSearchUploadStager(
318
+ upload_stager_config=OpenSearchUploadStagerConfig(index_name="mock_index")
319
+ )
320
+ stager_validation(
321
+ configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
322
+ input_file=upload_file,
323
+ stager=stager,
324
+ tmp_dir=tmp_path,
325
+ )