unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

This version of unstructured-ingest has been flagged as potentially problematic.

Files changed (87)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
test/integration/connectors/sql/test_snowflake.py

@@ -1,16 +1,19 @@
+import json
 import os
-import tempfile
 from pathlib import Path
 
-import docker
-import pandas as pd
 import pytest
 import snowflake.connector as sf
+from _pytest.fixtures import TopRequest
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
 from test.integration.connectors.utils.docker import container_context
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from test.integration.utils import requires_env
@@ -30,14 +33,15 @@ from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
 SEED_DATA_ROWS = 20
 
 
-def seed_data():
-    conn = sf.connect(
-        user="test",
-        password="test",
-        account="test",
-        database="test",
-        host="snowflake.localhost.localstack.cloud",
-    )
+def seed_data() -> dict:
+    connect_params = {
+        "user": "test",
+        "password": "test",
+        "account": "test",
+        "database": "test",
+        "host": "snowflake.localhost.localstack.cloud",
+    }
+    conn = sf.connect(**connect_params)
 
     file = Path(env_setup_path / "sql" / "snowflake" / "source" / "snowflake-schema.sql")
@@ -52,16 +56,31 @@ def seed_data():
 
     cur.close()
     conn.close()
+    return connect_params
 
 
-def init_db_destination():
-    conn = sf.connect(
-        user="test",
-        password="test",
-        account="test",
-        database="test",
-        host="snowflake.localhost.localstack.cloud",
-    )
+@pytest.fixture
+def source_database_setup() -> dict:
+    token = os.getenv("LOCALSTACK_AUTH_TOKEN")
+    with container_context(
+        image="localstack/snowflake",
+        environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
+        ports={4566: 4566, 443: 443},
+        healthcheck_retries=30,
+    ):
+        connect_params = seed_data()
+        yield connect_params
+
+
+def init_db_destination() -> dict:
+    connect_params = {
+        "user": "test",
+        "password": "test",
+        "account": "test",
+        "database": "test",
+        "host": "snowflake.localhost.localstack.cloud",
+    }
+    conn = sf.connect(**connect_params)
 
     file = Path(env_setup_path / "sql" / "snowflake" / "destination" / "snowflake-schema.sql")
@@ -73,52 +92,53 @@ def init_db_destination():
 
     cur.close()
     conn.close()
+    return connect_params
 
 
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
-@requires_env("LOCALSTACK_AUTH_TOKEN")
-async def test_snowflake_source():
-    docker_client = docker.from_env()
+@pytest.fixture
+def destination_database_setup() -> dict:
     token = os.getenv("LOCALSTACK_AUTH_TOKEN")
     with container_context(
-        docker_client=docker_client,
         image="localstack/snowflake",
         environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
         ports={4566: 4566, 443: 443},
        healthcheck_retries=30,
     ):
-        seed_data()
-        with tempfile.TemporaryDirectory() as tmpdir:
-            connection_config = SnowflakeConnectionConfig(
-                access_config=SnowflakeAccessConfig(password="test"),
-                account="test",
-                user="test",
-                database="test",
-                host="snowflake.localhost.localstack.cloud",
-            )
-            indexer = SnowflakeIndexer(
-                connection_config=connection_config,
-                index_config=SnowflakeIndexerConfig(
-                    table_name="cars", id_column="CAR_ID", batch_size=5
-                ),
-            )
-            downloader = SnowflakeDownloader(
-                connection_config=connection_config,
-                download_config=SnowflakeDownloaderConfig(
-                    fields=["CAR_ID", "BRAND"], download_dir=Path(tmpdir)
-                ),
-            )
-            await source_connector_validation(
-                indexer=indexer,
-                downloader=downloader,
-                configs=ValidationConfigs(
-                    test_id="snowflake",
-                    expected_num_files=SEED_DATA_ROWS,
-                    expected_number_indexed_file_data=4,
-                    validate_downloaded_files=True,
-                ),
-            )
+        connect_params = init_db_destination()
+        yield connect_params
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+@requires_env("LOCALSTACK_AUTH_TOKEN")
+async def test_snowflake_source(temp_dir: Path, source_database_setup: dict):
+    connection_config = SnowflakeConnectionConfig(
+        access_config=SnowflakeAccessConfig(password="test"),
+        account="test",
+        user="test",
+        database="test",
+        host="snowflake.localhost.localstack.cloud",
+    )
+    indexer = SnowflakeIndexer(
+        connection_config=connection_config,
+        index_config=SnowflakeIndexerConfig(table_name="cars", id_column="CAR_ID", batch_size=5),
+    )
+    downloader = SnowflakeDownloader(
+        connection_config=connection_config,
+        download_config=SnowflakeDownloaderConfig(
+            fields=["CAR_ID", "BRAND"], download_dir=temp_dir
+        ),
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="snowflake",
+            expected_num_files=SEED_DATA_ROWS,
+            expected_number_indexed_file_data=4,
+            validate_downloaded_files=True,
+        ),
+    )
 
 
 def validate_destination(
@@ -145,65 +165,70 @@ def validate_destination(
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
 @requires_env("LOCALSTACK_AUTH_TOKEN")
-async def test_snowflake_destination(upload_file: Path):
+async def test_snowflake_destination(
+    upload_file: Path, temp_dir: Path, destination_database_setup: dict
+):
     # the snowflake destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    docker_client = docker.from_env()
-    token = os.getenv("LOCALSTACK_AUTH_TOKEN")
-    with container_context(
-        docker_client=docker_client,
-        image="localstack/snowflake",
-        environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
-        ports={4566: 4566, 443: 443},
-        healthcheck_retries=30,
-    ):
-        init_db_destination()
-        with tempfile.TemporaryDirectory() as tmpdir:
-            stager = SnowflakeUploadStager()
-            stager_params = {
-                "elements_filepath": upload_file,
-                "file_data": mock_file_data,
-                "output_dir": Path(tmpdir),
-                "output_filename": "test_db",
-            }
-            if stager.is_async():
-                staged_path = await stager.run_async(**stager_params)
-            else:
-                staged_path = stager.run(**stager_params)
-
-            # The stager should append the `.json` suffix to the output filename passed in.
-            assert staged_path.name == "test_db.json"
-
-            connect_params = {
-                "user": "test",
-                "password": "test",
-                "account": "test",
-                "database": "test",
-                "host": "snowflake.localhost.localstack.cloud",
-            }
-
-            uploader = SnowflakeUploader(
-                connection_config=SnowflakeConnectionConfig(
-                    access_config=SnowflakeAccessConfig(password=connect_params["password"]),
-                    account=connect_params["account"],
-                    user=connect_params["user"],
-                    database=connect_params["database"],
-                    host=connect_params["host"],
-                )
-            )
-
-            uploader.run(path=staged_path, file_data=mock_file_data)
-
-            staged_df = pd.read_json(staged_path, orient="records", lines=True)
-            expected_num_elements = len(staged_df)
-            validate_destination(
-                connect_params=connect_params,
-                expected_num_elements=expected_num_elements,
-            )
-
-            uploader.run(path=staged_path, file_data=mock_file_data)
-            validate_destination(
-                connect_params=connect_params,
-                expected_num_elements=expected_num_elements,
-            )
+    init_db_destination()
+    stager = SnowflakeUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=mock_file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+
+    # The stager should preserve the suffix of the output filename passed in.
+    assert staged_path.suffix == upload_file.suffix
+
+    connect_params = {
+        "user": "test",
+        "password": "test",
+        "account": "test",
+        "database": "test",
+        "host": "snowflake.localhost.localstack.cloud",
+    }
+
+    uploader = SnowflakeUploader(
+        connection_config=SnowflakeConnectionConfig(
+            access_config=SnowflakeAccessConfig(password=connect_params["password"]),
+            account=connect_params["account"],
+            user=connect_params["user"],
+            database=connect_params["database"],
+            host=connect_params["host"],
+        )
+    )
+    uploader.precheck()
+    uploader.run(path=staged_path, file_data=mock_file_data)
+
+    with staged_path.open("r") as f:
+        staged_data = json.load(f)
+    expected_num_elements = len(staged_data)
+    validate_destination(
+        connect_params=connect_params,
+        expected_num_elements=expected_num_elements,
+    )
+
+    uploader.run(path=staged_path, file_data=mock_file_data)
+    validate_destination(
+        connect_params=connect_params,
+        expected_num_elements=expected_num_elements,
+    )
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_snowflake_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = SnowflakeUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
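
The pattern running through this file (and the other SQL connector tests below) is the move from inline setup to pytest fixtures: LocalStack container startup and schema seeding now live in source_database_setup/destination_database_setup fixtures that yield connection parameters, and tests request them as arguments instead of nesting container_context and tempfile.TemporaryDirectory blocks. A minimal sketch of the yield-fixture pattern, with start_database as a hypothetical stand-in for container_context plus seed_data:

from contextlib import contextmanager

import pytest


@contextmanager
def start_database():
    # Hypothetical stand-in for container_context() + seed_data(): start the
    # backing service and hand back the connection parameters it was seeded with.
    connect_params = {"user": "test", "password": "test", "host": "localhost"}
    yield connect_params
    # Teardown (e.g. stopping the container) would run here.


@pytest.fixture
def database_setup() -> dict:
    # Code before `yield` runs as setup, code after it as teardown, so the
    # service is cleaned up even when the test body fails.
    with start_database() as connect_params:
        yield connect_params


def test_connector(database_setup: dict):
    assert database_setup["host"] == "localhost"

The expected counts in the rewritten source tests are also consistent with the indexer batching rows into FileData objects: 20 seeded rows at batch_size=5 give ceil(20/5) = 4 indexed file data here, and 10 rows at batch_size=6 give ceil(10/6) = 2 in the sqlite test below.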

test/integration/connectors/sql/test_sqlite.py

@@ -1,14 +1,18 @@
+import json
 import sqlite3
 import tempfile
-from contextlib import contextmanager
 from pathlib import Path
 
-import pandas as pd
 import pytest
+from _pytest.fixtures import TopRequest
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
-from test.integration.connectors.utils.validation import (
-    ValidationConfigs,
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
     source_connector_validation,
 )
 from unstructured_ingest.v2.interfaces import FileData
@@ -23,11 +27,11 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     SQLiteUploadStager,
 )
 
-SEED_DATA_ROWS = 20
+SEED_DATA_ROWS = 10
 
 
-@contextmanager
-def sqlite_download_setup() -> Path:
+@pytest.fixture
+def source_database_setup() -> Path:
     with tempfile.TemporaryDirectory() as tmpdir:
         db_path = Path(tmpdir) / "mock_database.db"
         db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
@@ -49,49 +53,42 @@ def sqlite_download_setup() -> Path:
 
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
-async def test_sqlite_source():
-    with sqlite_download_setup() as db_path:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            connection_config = SQLiteConnectionConfig(database_path=db_path)
-            indexer = SQLiteIndexer(
-                connection_config=connection_config,
-                index_config=SQLiteIndexerConfig(
-                    table_name="cars", id_column="car_id", batch_size=5
-                ),
-            )
-            downloader = SQLiteDownloader(
-                connection_config=connection_config,
-                download_config=SQLiteDownloaderConfig(
-                    fields=["car_id", "brand"], download_dir=Path(tmpdir)
-                ),
-            )
-            await source_connector_validation(
-                indexer=indexer,
-                downloader=downloader,
-                configs=ValidationConfigs(
-                    test_id="sqlite",
-                    expected_num_files=SEED_DATA_ROWS,
-                    expected_number_indexed_file_data=4,
-                    validate_downloaded_files=True,
-                ),
-            )
-
-
-@contextmanager
-def sqlite_upload_setup() -> Path:
+async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
+    connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
+    indexer = SQLiteIndexer(
+        connection_config=connection_config,
+        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
+    )
+    downloader = SQLiteDownloader(
+        connection_config=connection_config,
+        download_config=SQLiteDownloaderConfig(fields=["car_id", "brand"], download_dir=temp_dir),
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="sqlite",
+            expected_num_files=SEED_DATA_ROWS,
+            expected_number_indexed_file_data=2,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
+@pytest.fixture
+def destination_database_setup(temp_dir: Path) -> Path:
     # Provision the local file that sqlite points to to have the desired schema for the integration
     # tests and make sure the file and connection get cleaned up by using a context manager.
-    with tempfile.TemporaryDirectory() as tmpdir:
-        db_path = Path(tmpdir) / "elements.db"
-        db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
-        assert db_init_path.exists()
-        assert db_init_path.is_file()
-        with sqlite3.connect(database=db_path) as sqlite_connection:
-            with db_init_path.open("r") as f:
-                query = f.read()
-            cursor = sqlite_connection.cursor()
-            cursor.executescript(query)
-        yield db_path
+    db_path = temp_dir / "elements.db"
+    db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
+    assert db_init_path.exists()
+    assert db_init_path.is_file()
+    with sqlite3.connect(database=db_path) as sqlite_connection:
+        with db_init_path.open("r") as f:
+            query = f.read()
+        cursor = sqlite_connection.cursor()
+        cursor.executescript(query)
+    return db_path
 
 
 def validate_destination(db_path: Path, expected_num_elements: int):
@@ -114,34 +111,48 @@ def validate_destination(db_path: Path, expected_num_elements: int):
 
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
-async def test_sqlite_destination(upload_file: Path):
+async def test_sqlite_destination(
+    upload_file: Path, temp_dir: Path, destination_database_setup: Path
+):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
-    with sqlite_upload_setup() as db_path:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            stager = SQLiteUploadStager()
-            stager_params = {
-                "elements_filepath": upload_file,
-                "file_data": mock_file_data,
-                "output_dir": Path(tmpdir),
-                "output_filename": "test_db",
-            }
-            if stager.is_async():
-                staged_path = await stager.run_async(**stager_params)
-            else:
-                staged_path = stager.run(**stager_params)
-
-            # The stager should append the `.json` suffix to the output filename passed in.
-            assert staged_path.name == "test_db.json"
-
-            uploader = SQLiteUploader(
-                connection_config=SQLiteConnectionConfig(database_path=db_path)
-            )
-            uploader.run(path=staged_path, file_data=mock_file_data)
-
-            staged_df = pd.read_json(staged_path, orient="records", lines=True)
-            validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
-
-            uploader.run(path=staged_path, file_data=mock_file_data)
-            validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
+    stager = SQLiteUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=mock_file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+
+    # The stager should preserve the suffix of the output filename passed in.
+    assert staged_path.suffix == upload_file.suffix
+
+    uploader = SQLiteUploader(
+        connection_config=SQLiteConnectionConfig(database_path=destination_database_setup)
+    )
+    uploader.precheck()
+    uploader.run(path=staged_path, file_data=mock_file_data)
+
+    with staged_path.open("r") as f:
+        staged_data = json.load(f)
+    validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
+
+    uploader.run(path=staged_path, file_data=mock_file_data)
+    validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_sqlite_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = SQLiteUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )

test/integration/connectors/test_astradb.py

@@ -5,16 +5,27 @@ from pathlib import Path
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from astrapy import Collection
 from astrapy import DataAPIClient as AstraDBClient
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.astradb import (
     CONNECTOR_TYPE,
     AstraDBAccessConfig,
     AstraDBConnectionConfig,
+    AstraDBDownloader,
+    AstraDBDownloaderConfig,
     AstraDBIndexer,
     AstraDBIndexerConfig,
     AstraDBUploader,
@@ -105,10 +116,44 @@ def collection(upload_file: Path) -> Collection:
         astra_db.drop_collection(collection)
 
 
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+async def test_astra_search_source(
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    collection_name = "ingest_test_src"
+    connection_config = AstraDBConnectionConfig(
+        access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
+    )
+    indexer = AstraDBIndexer(
+        index_config=AstraDBIndexerConfig(
+            collection_name=collection_name,
+        ),
+        connection_config=connection_config,
+    )
+    downloader = AstraDBDownloader(
+        connection_config=connection_config,
+        download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
+    )
+
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=5,
+            expected_number_indexed_file_data=1,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
-async def test_azure_ai_search_destination(
+async def test_astra_search_destination(
     upload_file: Path,
     collection: Collection,
     tmp_path: Path,
@@ -154,3 +199,19 @@ async def test_azure_ai_search_destination(
         f"Expected count ({expected_count}) doesn't match how "
         f"much came back from collection: {current_count}"
     )
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_astra_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = AstraDBUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
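
Taken together, these destination tests pin down one stager contract: run(elements_filepath, file_data, output_dir, output_filename) writes a staged file and returns its path, and per the updated assertions the staged file keeps the input filename's suffix (.json or .ndjson) instead of always being renamed to .json. A toy stand-in illustrating only the shape of that contract, not unstructured-ingest's actual UploadStager implementation (this sketch handles plain JSON; the real stagers also accept NDJSON):

import json
from pathlib import Path


class ToyUploadStager:
    # Illustrative only: mirrors the call signature the tests exercise.
    def run(
        self,
        elements_filepath: Path,
        file_data: object,  # a FileData instance in the real interface
        output_dir: Path,
        output_filename: str,
    ) -> Path:
        # Keep the caller's filename, and therefore its suffix, intact.
        output_path = output_dir / output_filename
        elements = json.loads(elements_filepath.read_text())
        output_path.write_text(json.dumps(elements))
        return output_path

Under that contract, calling stager.run(..., output_filename=upload_file.name) on elements.ndjson returns a path ending in .ndjson, which is exactly what assert staged_path.suffix == upload_file.suffix checks.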

test/integration/connectors/test_azure_ai_search.py

@@ -5,6 +5,7 @@ from pathlib import Path
 from uuid import uuid4
 
 import pytest
+from _pytest.fixtures import TopRequest
 from azure.core.credentials import AzureKeyCredential
 from azure.search.documents import SearchClient
 from azure.search.documents.indexes import SearchIndexClient
@@ -25,6 +26,10 @@ from azure.search.documents.indexes.models import (
 from test.integration.connectors.utils.constants import (
     DESTINATION_TAG,
 )
+from test.integration.connectors.utils.validation.destination import (
+    StagerValidationConfigs,
+    stager_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
@@ -225,9 +230,26 @@ async def test_azure_ai_search_destination(
     with staged_filepath.open() as f:
         staged_elements = json.load(f)
     expected_count = len(staged_elements)
-    search_client: SearchClient = uploader.connection_config.get_search_client()
-    validate_count(search_client=search_client, expected_count=expected_count)
+    with uploader.connection_config.get_search_client() as search_client:
+        validate_count(search_client=search_client, expected_count=expected_count)
 
     # Rerun and make sure the same documents get updated
     uploader.run(path=staged_filepath, file_data=file_data)
-    validate_count(search_client=search_client, expected_count=expected_count)
+    with uploader.connection_config.get_search_client() as search_client:
+        validate_count(search_client=search_client, expected_count=expected_count)
+
+
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_azure_ai_search_stager(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = AzureAISearchUploadStager()
+    stager_validation(
+        configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
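
The one behavioral change in the destination test itself is wrapping get_search_client() in a with block. The azure-search-documents SearchClient wraps an HTTP session and supports the context-manager protocol, so scoping it per validation pass closes the session deterministically instead of leaving it open for the remainder of the test. A short sketch with placeholder endpoint, index, and key values:

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Endpoint, index name, and API key below are placeholders for illustration.
client = SearchClient(
    endpoint="https://example.search.windows.net",
    index_name="ingest-test",
    credential=AzureKeyCredential("<api-key>"),
)

# __exit__ calls close(), releasing the underlying HTTP session once the
# block finishes.
with client:
    count = client.get_document_count()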