unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Note: this version of unstructured-ingest was flagged as a potentially problematic release.
- test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
- test/integration/connectors/sql/test_postgres.py +9 -5
- test/integration/connectors/sql/test_singlestore.py +9 -5
- test/integration/connectors/sql/test_snowflake.py +6 -2
- test/integration/connectors/sql/test_sqlite.py +9 -5
- test/integration/connectors/test_astradb.py +40 -0
- test/integration/connectors/test_kafka.py +2 -2
- test/integration/connectors/test_mongodb.py +4 -1
- test/integration/connectors/utils/validation/source.py +31 -11
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +69 -15
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +37 -33
- unstructured_ingest/v2/processes/connectors/couchbase.py +52 -41
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/mongodb.py +94 -100
- unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +36 -26
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/METADATA +11 -10
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/RECORD +52 -52
- /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/top_level.txt +0 -0
test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py

@@ -1,10 +1,10 @@
 import json
 import os
-import tempfile
 import uuid
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
+from unittest import mock
 
 import pytest
 from databricks.sdk import WorkspaceClient
@@ -31,11 +31,15 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native impor
 
 
 @dataclass
-class EnvData:
+class BaseEnvData:
     host: str
+    catalog: str
+
+
+@dataclass
+class BasicAuthEnvData(BaseEnvData):
     client_id: str
     client_secret: str
-    catalog: str
 
     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
         return DatabricksNativeVolumesConnectionConfig(
@@ -47,8 +51,21 @@ class EnvData:
         )
 
 
-def get_env_data() -> EnvData:
-    return EnvData(
+@dataclass
+class PATEnvData(BaseEnvData):
+    token: str
+
+    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
+        return DatabricksNativeVolumesConnectionConfig(
+            host=self.host,
+            access_config=DatabricksNativeVolumesAccessConfig(
+                token=self.token,
+            ),
+        )
+
+
+def get_basic_auth_env_data() -> BasicAuthEnvData:
+    return BasicAuthEnvData(
         host=os.environ["DATABRICKS_HOST"],
         client_id=os.environ["DATABRICKS_CLIENT_ID"],
         client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
@@ -56,23 +73,30 @@ def get_env_data() -> EnvData:
     )
 
 
+def get_pat_env_data() -> PATEnvData:
+    return PATEnvData(
+        host=os.environ["DATABRICKS_HOST"],
+        catalog=os.environ["DATABRICKS_CATALOG"],
+        token=os.environ["DATABRICKS_PAT"],
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
 @requires_env(
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
-async def test_volumes_native_source():
-    env_data = get_env_data()
-
-
-
-
-
-
-
-
-
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
+async def test_volumes_native_source(tmp_path: Path):
+    env_data = get_basic_auth_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
         indexer = DatabricksNativeVolumesIndexer(
             connection_config=connection_config, index_config=indexer_config
         )
@@ -89,12 +113,44 @@ async def test_volumes_native_source():
         )
 
 
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
+async def test_volumes_native_source_pat(tmp_path: Path):
+    env_data = get_pat_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
+        indexer = DatabricksNativeVolumesIndexer(
+            connection_config=connection_config, index_config=indexer_config
+        )
+        downloader = DatabricksNativeVolumesDownloader(
+            connection_config=connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=SourceValidationConfigs(
+                test_id="databricks_volumes_native_pat",
+                expected_num_files=1,
+            ),
+        )
+
+
 def _get_volume_path(catalog: str, volume: str, volume_path: str):
     return f"/Volumes/{catalog}/default/{volume}/{volume_path}"
 
 
 @contextmanager
-def databricks_destination_context(
+def databricks_destination_context(
+    env_data: BasicAuthEnvData, volume: str, volume_path
+) -> WorkspaceClient:
     client = WorkspaceClient(
         host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
     )
@@ -137,7 +193,7 @@ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_p
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
 async def test_volumes_native_destination(upload_file: Path):
-    env_data = get_env_data()
+    env_data = get_basic_auth_env_data()
     volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
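The new PATEnvData path above adds personal access token authentication alongside the existing client-id/secret flow. As a rough sketch of wiring a PAT-backed source outside the test harness (environment variable names follow the test; the volume and download-directory values are placeholders):

    import os
    from pathlib import Path

    from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
        DatabricksNativeVolumesAccessConfig,
        DatabricksNativeVolumesConnectionConfig,
        DatabricksNativeVolumesDownloader,
        DatabricksNativeVolumesDownloaderConfig,
        DatabricksNativeVolumesIndexer,
        DatabricksNativeVolumesIndexerConfig,
    )

    # Token auth mirrors PATEnvData.get_connection_config() in the test above.
    connection_config = DatabricksNativeVolumesConnectionConfig(
        host=os.environ["DATABRICKS_HOST"],
        access_config=DatabricksNativeVolumesAccessConfig(token=os.environ["DATABRICKS_PAT"]),
    )
    indexer = DatabricksNativeVolumesIndexer(
        connection_config=connection_config,
        index_config=DatabricksNativeVolumesIndexerConfig(
            recursive=True,
            catalog=os.environ["DATABRICKS_CATALOG"],
            volume="my-volume",  # placeholder volume name
            volume_path="some/prefix",  # placeholder path within the volume
        ),
    )
    downloader = DatabricksNativeVolumesDownloader(
        connection_config=connection_config,
        download_config=DatabricksNativeVolumesDownloaderConfig(
            download_dir=Path("/tmp/databricks-volumes")  # placeholder local directory
        ),
    )
    # As in the tests above, these objects would then be driven by
    # source_connector_validation or by an ingest pipeline.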
test/integration/connectors/sql/test_postgres.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     CONNECTOR_TYPE,
     PostgresAccessConfig,
@@ -28,7 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     PostgresUploadStager,
 )
 
-SEED_DATA_ROWS =
+SEED_DATA_ROWS = 10
 
 
 @pytest.fixture
@@ -69,7 +69,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
     )
     indexer = PostgresIndexer(
         connection_config=connection_config,
-        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=
+        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = PostgresDownloader(
         connection_config=connection_config,
@@ -81,7 +81,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
         configs=SourceValidationConfigs(
             test_id="postgres",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )
@@ -119,7 +119,11 @@ def validate_destination(
 async def test_postgres_destination(upload_file: Path, temp_dir: Path):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
     ):
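The mock FileData block above (an explicit identifier, connector_type, and source_identifiers) reflects FileData becoming a pydantic model with required fields; the same pattern repeats in the SingleStore, Snowflake, and SQLite destination tests below. A minimal sketch of building such a stand-in, with a placeholder file path:

    from pathlib import Path

    from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
    from unstructured_ingest.v2.processes.connectors.sql.postgres import CONNECTOR_TYPE

    upload_file = Path("example-elements.json")  # placeholder path
    mock_file_data = FileData(
        identifier="mock file data",
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
    )
    # FileData is now a pydantic model, so serialization goes through
    # model_dump() / model_validate() rather than the old dataclass helpers.
    print(mock_file_data.model_dump())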
test/integration/connectors/sql/test_singlestore.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     CONNECTOR_TYPE,
     SingleStoreAccessConfig,
@@ -29,7 +29,7 @@ from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     SingleStoreUploadStager,
 )
 
-SEED_DATA_ROWS =
+SEED_DATA_ROWS = 10
 
 
 @pytest.fixture
@@ -66,7 +66,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
     )
     indexer = SingleStoreIndexer(
         connection_config=connection_config,
-        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=
+        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SingleStoreDownloader(
         connection_config=connection_config,
@@ -80,7 +80,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
         configs=SourceValidationConfigs(
             test_id="singlestore",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )
@@ -103,7 +103,11 @@ def validate_destination(
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
 async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
-    mock_file_data = FileData(
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     with docker_compose_context(
         docker_compose_path=env_setup_path / "sql" / "singlestore" / "destination"
     ):
test/integration/connectors/sql/test_snowflake.py

@@ -17,7 +17,7 @@ from test.integration.connectors.utils.validation.source import (
     source_connector_validation,
 )
 from test.integration.utils import requires_env
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
     CONNECTOR_TYPE,
     SnowflakeAccessConfig,
@@ -170,7 +170,11 @@ async def test_snowflake_destination(
 ):
     # the postgres destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     init_db_destination()
     stager = SnowflakeUploadStager()
     staged_path = stager.run(
test/integration/connectors/sql/test_sqlite.py

@@ -15,7 +15,7 @@ from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
 )
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     CONNECTOR_TYPE,
     SQLiteConnectionConfig,
@@ -27,7 +27,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     SQLiteUploadStager,
 )
 
-SEED_DATA_ROWS =
+SEED_DATA_ROWS = 10
 
 
 @pytest.fixture
@@ -57,7 +57,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
     connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
     indexer = SQLiteIndexer(
         connection_config=connection_config,
-        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=
+        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SQLiteDownloader(
         connection_config=connection_config,
@@ -69,7 +69,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
         configs=SourceValidationConfigs(
             test_id="sqlite",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )
@@ -116,7 +116,11 @@ async def test_sqlite_destination(
 ):
     # the sqlite destination connector doesn't leverage the file data but is required as an input,
     # mocking it with arbitrary values to meet the base requirements:
-    mock_file_data = FileData(
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
     stager = SQLiteUploadStager()
     staged_path = stager.run(
         elements_filepath=upload_file,
test/integration/connectors/test_astradb.py

@@ -14,12 +14,18 @@ from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
     stager_validation,
 )
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.astradb import (
     CONNECTOR_TYPE,
     AstraDBAccessConfig,
     AstraDBConnectionConfig,
+    AstraDBDownloader,
+    AstraDBDownloaderConfig,
     AstraDBIndexer,
     AstraDBIndexerConfig,
     AstraDBUploader,
@@ -110,6 +116,40 @@ def collection(upload_file: Path) -> Collection:
         astra_db.drop_collection(collection)
 
 
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+async def test_astra_search_source(
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    collection_name = "ingest_test_src"
+    connection_config = AstraDBConnectionConfig(
+        access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
+    )
+    indexer = AstraDBIndexer(
+        index_config=AstraDBIndexerConfig(
+            collection_name=collection_name,
+        ),
+        connection_config=connection_config,
+    )
+    downloader = AstraDBDownloader(
+        connection_config=connection_config,
+        download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
+    )
+
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=5,
+            expected_number_indexed_file_data=1,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
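The Astra DB connector gains a source path here: AstraDBIndexer lists a collection and AstraDBDownloader writes each record locally. A sketch of driving that pair directly, mirroring the async handling used by source_connector_validation (the endpoint and token come from the same environment variables as the test; collection name and download directory are placeholders):

    import asyncio
    import os
    from pathlib import Path

    from unstructured_ingest.v2.processes.connectors.astradb import (
        AstraDBAccessConfig,
        AstraDBConnectionConfig,
        AstraDBDownloader,
        AstraDBDownloaderConfig,
        AstraDBIndexer,
        AstraDBIndexerConfig,
    )

    connection_config = AstraDBConnectionConfig(
        access_config=AstraDBAccessConfig(
            token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
            api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
        )
    )
    indexer = AstraDBIndexer(
        connection_config=connection_config,
        index_config=AstraDBIndexerConfig(collection_name="ingest_test_src"),  # placeholder
    )
    downloader = AstraDBDownloader(
        connection_config=connection_config,
        download_config=AstraDBDownloaderConfig(download_dir=Path("/tmp/astra-downloads")),
    )

    async def fetch_all() -> None:
        # Same shape as the validation harness: index, then download each record,
        # using the async path when the downloader advertises it.
        for file_data in indexer.run():
            if downloader.is_async():
                await downloader.run_async(file_data=file_data)
            else:
                downloader.run(file_data=file_data)

    asyncio.run(fetch_all())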
test/integration/connectors/test_kafka.py

@@ -122,7 +122,7 @@ async def test_kafka_source_local(kafka_seed_topic: str):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+            test_id="kafka-local", expected_num_files=5, validate_downloaded_files=True
         ),
     )
 
@@ -204,7 +204,7 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka",
+            test_id="kafka-cloud",
             exclude_fields_extend=["connector_type"],
             expected_num_files=expected_messages,
             validate_downloaded_files=True,
test/integration/connectors/test_mongodb.py

@@ -197,7 +197,10 @@ async def test_mongodb_source(temp_dir: Path):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id=CONNECTOR_TYPE,
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=4,
+            validate_downloaded_files=True,
+            expected_number_indexed_file_data=1,
         ),
     )
 
test/integration/connectors/utils/validation/source.py

@@ -1,14 +1,13 @@
 import json
 import os
 import shutil
-from dataclasses import replace
 from pathlib import Path
 from typing import Callable, Optional
 
 from deepdiff import DeepDiff
 from pydantic import Field
 
-from test.integration.connectors.utils.validation.utils import ValidationConfig
+from test.integration.connectors.utils.validation.utils import ValidationConfig
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
@@ -92,7 +91,7 @@ def check_contents(
     file_data_path = expected_output_dir / f"{file_data.identifier}.json"
     with file_data_path.open("r") as file:
         expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.
+    current_file_data_contents = file_data.model_dump()
     expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
     current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
     diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -160,9 +159,11 @@ def update_fixtures(
     save_filedata: bool = True,
 ):
     # Rewrite the current file data
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True)
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
-
+        shutil.rmtree(path=file_data_output_path, ignore_errors=True)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -171,7 +172,7 @@ def update_fixtures(
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
             with file_data_path.open(mode="w") as f:
-                json.dump(file_data.
+                json.dump(file_data.model_dump(), f, indent=2)
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -183,7 +184,7 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
-
+        shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -213,7 +214,10 @@ def run_all_validations(
     if configs.validate_file_data:
         run_expected_results_validation(
             expected_output_dir=test_output_dir / "file_data",
-            all_file_data=
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=predownload_file_data,
+                all_postdownload_file_data=postdownload_file_data,
+            ),
             configs=configs,
         )
     download_files = get_files(dir_path=download_dir)
@@ -229,6 +233,19 @@ def run_all_validations(
     )
 
 
+def get_all_file_data(
+    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
+) -> list[FileData]:
+    all_file_data = all_postdownload_file_data
+    indexed_file_data = [
+        fd
+        for fd in all_predownload_file_data
+        if fd.identifier not in [f.identifier for f in all_file_data]
+    ]
+    all_file_data += indexed_file_data
+    return all_file_data
+
+
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
@@ -246,7 +263,7 @@ async def source_connector_validation(
     test_output_dir = configs.test_output_dir()
     for file_data in indexer.run():
         assert file_data
-        predownload_file_data =
+        predownload_file_data = file_data.model_copy(deep=True)
        all_predownload_file_data.append(predownload_file_data)
         if downloader.is_async():
             resp = await downloader.run_async(file_data=file_data)
@@ -254,10 +271,10 @@ async def source_connector_validation(
             resp = downloader.run(file_data=file_data)
         if isinstance(resp, list):
             for r in resp:
-                postdownload_file_data =
+                postdownload_file_data = r["file_data"].model_copy(deep=True)
                 all_postdownload_file_data.append(postdownload_file_data)
         else:
-            postdownload_file_data =
+            postdownload_file_data = resp["file_data"].model_copy(deep=True)
             all_postdownload_file_data.append(postdownload_file_data)
         if not overwrite_fixtures:
             print("Running validation")
@@ -273,7 +290,10 @@ async def source_connector_validation(
     update_fixtures(
         output_dir=test_output_dir,
         download_dir=download_dir,
-        all_file_data=
+        all_file_data=get_all_file_data(
+            all_predownload_file_data=all_predownload_file_data,
+            all_postdownload_file_data=all_postdownload_file_data,
+        ),
         save_downloads=configs.validate_downloaded_files,
         save_filedata=configs.validate_file_data,
     )
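Besides swapping the dataclass helpers for their pydantic equivalents (model_dump, model_copy), the validation harness now merges indexed and downloaded records through get_all_file_data. A small illustration of that merge using placeholder FileData values: post-download records win, and indexed records that never produced a download are appended.

    from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers

    def make(identifier: str) -> FileData:
        # Placeholder FileData; connector_type and paths are illustrative only.
        return FileData(
            identifier=identifier,
            connector_type="example",
            source_identifiers=SourceIdentifiers(filename="f.txt", fullpath="f.txt"),
        )

    pre = [make("a"), make("b")]   # everything the indexer produced
    post = [make("a")]             # only "a" made it through download
    # Mirrors get_all_file_data(): keep post-download records, then append any
    # indexed record whose identifier is not already present.
    merged = post + [fd for fd in pre if fd.identifier not in {f.identifier for f in post}]
    assert [fd.identifier for fd in merged] == ["a", "b"]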
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.3.9"  # pragma: no cover
+__version__ = "0.3.11"  # pragma: no cover
unstructured_ingest/v2/interfaces/__init__.py

@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
+from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -27,4 +27,6 @@ __all__ = [
     "ConnectionConfig",
     "BaseConnector",
     "FileDataSourceMetadata",
+    "BatchFileData",
+    "BatchItem",
 ]
unstructured_ingest/v2/interfaces/file_data.py

@@ -1,13 +1,14 @@
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any,
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
 
-from
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
 
+from unstructured_ingest.v2.logger import logger
 
-@dataclass
-class SourceIdentifiers:
+
+class SourceIdentifiers(BaseModel):
     filename: str
     fullpath: str
     rel_path: Optional[str] = None
@@ -21,8 +22,7 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath
 
 
-@dataclass
-class FileDataSourceMetadata(DataClassJsonMixin):
+class FileDataSourceMetadata(BaseModel):
     url: Optional[str] = None
     version: Optional[str] = None
     record_locator: Optional[dict[str, Any]] = None
@@ -33,14 +33,12 @@ class FileDataSourceMetadata(DataClassJsonMixin):
     filesize_bytes: Optional[int] = None
 
 
-@dataclass
-class FileData(DataClassJsonMixin):
+class FileData(BaseModel):
     identifier: str
     connector_type: str
-    source_identifiers:
-
-
-    additional_metadata: dict[str, Any] = field(default_factory=dict)
+    source_identifiers: SourceIdentifiers
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
     display_name: Optional[str] = None
@@ -52,11 +50,67 @@ class FileData(DataClassJsonMixin):
             raise ValueError(f"file path not valid: {path}")
         with open(str(path.resolve()), "rb") as f:
             file_data_dict = json.load(f)
-            file_data =
+            file_data = cls.model_validate(file_data_dict)
         return file_data
 
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
     def to_file(self, path: str) -> None:
         path = Path(path).resolve()
         path.parent.mkdir(parents=True, exist_ok=True)
         with open(str(path.resolve()), "w") as f:
-            json.dump(self.
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+    source_identifiers: Optional[SourceIdentifiers] = None
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not valid for batch file data")
+
+    return FileData.from_file(path=path)
+
+
+def file_data_from_dict(data: dict) -> FileData:
+    try:
+        return BatchFileData.model_validate(data)
+    except ValidationError:
+        logger.debug(f"{data} not valid for batch file data")
+
+    return FileData.model_validate(data)