unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest may be problematic.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/test_postgres.py +100 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/test_sqlite.py +91 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +198 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/utils/dep_check.py +12 -0
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
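
Many of the new integration tests below are gated on credentials via a `requires_env` helper imported from test/integration/utils.py (+15 lines, whose body is not included in this excerpt). The following is only a hypothetical sketch of what such a decorator could look like, assuming a pytest skipif-based implementation; the names and details are assumptions, not the packaged code:

import os

import pytest


def requires_env(*env_vars: str):
    # Hypothetical sketch: skip the decorated test unless every named
    # environment variable is set. Not the actual file from the wheel.
    missing = [var for var in env_vars if not os.getenv(var)]
    return pytest.mark.skipif(
        bool(missing),
        reason=f"Missing required environment variables: {', '.join(missing)}",
    )
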
test/__init__.py
ADDED
File without changes

test/integration/__init__.py
ADDED
File without changes

test/integration/chunkers/__init__.py
ADDED
File without changes

test/integration/chunkers/test_chunkers.py
ADDED
@@ -0,0 +1,42 @@
import os
from pathlib import Path

import pytest

from test.integration.utils import requires_env
from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig

int_test_dir = Path(__file__).parent
assets_dir = int_test_dir / "assets"

chunker_files = [path for path in assets_dir.iterdir() if path.is_file()]


@pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
@pytest.mark.parametrize("strategy", ["basic", "by_title", "by_similarity", "by_page"])
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_chunker_api(chunker_file: Path, strategy: str):
    api_key = os.getenv("UNSTRUCTURED_API_KEY")
    api_url = os.getenv("UNSTRUCTURED_API_URL")

    chunker_config = ChunkerConfig(
        chunking_strategy=strategy,
        chunk_by_api=True,
        chunk_api_key=api_key,
        chunking_endpoint=api_url,
    )
    chunker = Chunker(config=chunker_config)
    results = await chunker.run_async(elements_filepath=chunker_file)
    assert results


@pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
@pytest.mark.parametrize("strategy", ["basic", "by_title"])
def test_chunker_basic(chunker_file: Path, strategy: str):
    chunker_config = ChunkerConfig(
        chunking_strategy=strategy,
    )
    chunker = Chunker(config=chunker_config)
    results = chunker.run(elements_filepath=chunker_file)
    assert results

test/integration/connectors/__init__.py
ADDED
File without changes

test/integration/connectors/conftest.py
ADDED
@@ -0,0 +1,15 @@
from pathlib import Path

import pytest

FILENAME = "DA-1p-with-duplicate-pages.pdf.json"


@pytest.fixture
def upload_file() -> Path:
    int_test_dir = Path(__file__).parent
    assets_dir = int_test_dir / "assets"
    upload_file = assets_dir / FILENAME
    assert upload_file.exists()
    assert upload_file.is_file()
    return upload_file

test/integration/connectors/databricks_tests/__init__.py
ADDED
File without changes

test/integration/connectors/databricks_tests/test_volumes_native.py
ADDED
@@ -0,0 +1,165 @@
import json
import os
import tempfile
import uuid
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path

import pytest
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors.platform import NotFound

from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
from test.integration.connectors.utils.validation import (
    ValidationConfigs,
    source_connector_validation,
)
from test.integration.utils import requires_env
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
    CONNECTOR_TYPE,
    DatabricksNativeVolumesAccessConfig,
    DatabricksNativeVolumesConnectionConfig,
    DatabricksNativeVolumesDownloader,
    DatabricksNativeVolumesDownloaderConfig,
    DatabricksNativeVolumesIndexer,
    DatabricksNativeVolumesIndexerConfig,
    DatabricksNativeVolumesUploader,
    DatabricksNativeVolumesUploaderConfig,
)


@dataclass
class EnvData:
    host: str
    client_id: str
    client_secret: str
    catalog: str

    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
        return DatabricksNativeVolumesConnectionConfig(
            host=self.host,
            access_config=DatabricksNativeVolumesAccessConfig(
                client_id=self.client_id,
                client_secret=self.client_secret,
            ),
        )


def get_env_data() -> EnvData:
    return EnvData(
        host=os.environ["DATABRICKS_HOST"],
        client_id=os.environ["DATABRICKS_CLIENT_ID"],
        client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
        catalog=os.environ["DATABRICKS_CATALOG"],
    )


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env(
    "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
)
async def test_volumes_native_source():
    env_data = get_env_data()
    indexer_config = DatabricksNativeVolumesIndexerConfig(
        recursive=True,
        volume="test-platform",
        volume_path="databricks-volumes-test-input",
        catalog=env_data.catalog,
    )
    connection_config = env_data.get_connection_config()
    with tempfile.TemporaryDirectory() as tempdir:
        tempdir_path = Path(tempdir)
        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
        indexer = DatabricksNativeVolumesIndexer(
            connection_config=connection_config, index_config=indexer_config
        )
        downloader = DatabricksNativeVolumesDownloader(
            connection_config=connection_config, download_config=download_config
        )
        await source_connector_validation(
            indexer=indexer,
            downloader=downloader,
            configs=ValidationConfigs(
                test_id="databricks_volumes_native",
                expected_num_files=1,
            ),
        )


def _get_volume_path(catalog: str, volume: str, volume_path: str):
    return f"/Volumes/{catalog}/default/{volume}/{volume_path}"


@contextmanager
def databricks_destination_context(env_data: EnvData, volume: str, volume_path) -> WorkspaceClient:
    client = WorkspaceClient(
        host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
    )
    try:
        yield client
    finally:
        # Cleanup
        try:
            for file in client.files.list_directory_contents(
                directory_path=_get_volume_path(env_data.catalog, volume, volume_path)
            ):
                client.files.delete(file.path)
            client.files.delete_directory(_get_volume_path(env_data.catalog, volume, volume_path))
        except NotFound:
            # Directory was never created, don't need to delete
            pass


def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_path: str):
    files = list(
        client.files.list_directory_contents(
            directory_path=_get_volume_path(catalog, volume, volume_path)
        )
    )

    assert len(files) == 1

    resp = client.files.download(files[0].path)
    data = json.loads(resp.contents.read())

    assert len(data) == 22
    element_types = {v["type"] for v in data}
    assert len(element_types) == 1
    assert "CompositeElement" in element_types


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
@requires_env(
    "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
)
async def test_volumes_native_destination(upload_file: Path):
    env_data = get_env_data()
    volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
    with databricks_destination_context(
        volume="test-platform", volume_path=volume_path, env_data=env_data
    ) as workspace_client:
        connection_config = env_data.get_connection_config()
        uploader = DatabricksNativeVolumesUploader(
            connection_config=connection_config,
            upload_config=DatabricksNativeVolumesUploaderConfig(
                volume="test-platform",
                volume_path=volume_path,
                catalog=env_data.catalog,
            ),
        )
        if uploader.is_async():
            await uploader.run_async(path=upload_file, file_data=mock_file_data)
        else:
            uploader.run(path=upload_file, file_data=mock_file_data)

        validate_upload(
            client=workspace_client,
            catalog=env_data.catalog,
            volume="test-platform",
            volume_path=volume_path,
        )

test/integration/connectors/test_postgres.py
ADDED
@@ -0,0 +1,100 @@
import tempfile
from pathlib import Path

import pandas as pd
import pytest
from psycopg2 import connect

from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
from test.integration.connectors.utils.docker_compose import docker_compose_context
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.processes.connectors.sql.postgres import (
    CONNECTOR_TYPE,
    PostgresAccessConfig,
    PostgresConnectionConfig,
    PostgresUploader,
    PostgresUploadStager,
)


def validate_destination(
    connect_params: dict,
    expected_num_elements: int,
    test_embedding: list[float],
    expected_text: str,
):
    # Run the following validations:
    # * Check that the number of records in the table match the expected value
    # * Given the embedding, make sure it matches the associated text it belongs to
    with connect(**connect_params) as connection:
        cursor = connection.cursor()
        query = "select count(*) from elements;"
        cursor.execute(query)
        count = cursor.fetchone()[0]
        assert (
            count == expected_num_elements
        ), f"dest check failed: got {count}, expected {expected_num_elements}"

        cursor.execute("SELECT embeddings FROM elements order by text limit 1")
        similarity_query = (
            f"SELECT text FROM elements ORDER BY embeddings <-> '{test_embedding}' LIMIT 1;"
        )
        cursor.execute(similarity_query)
        res = cursor.fetchone()
        assert res[0] == expected_text


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
async def test_postgres_destination(upload_file: Path):
    # the postgres destination connector doesn't leverage the file data but is required as an input,
    # mocking it with arbitrary values to meet the base requirements:
    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
    with docker_compose_context(docker_compose_path=env_setup_path / "sql"):
        with tempfile.TemporaryDirectory() as tmpdir:
            stager = PostgresUploadStager()
            stager_params = {
                "elements_filepath": upload_file,
                "file_data": mock_file_data,
                "output_dir": Path(tmpdir),
                "output_filename": "test_db",
            }
            if stager.is_async():
                staged_path = await stager.run_async(**stager_params)
            else:
                staged_path = stager.run(**stager_params)

            # The stager should append the `.json` suffix to the output filename passed in.
            assert staged_path.name == "test_db.json"

            connect_params = {
                "host": "localhost",
                "port": 5433,
                "database": "elements",
                "user": "unstructured",
                "password": "test",
            }

            uploader = PostgresUploader(
                connection_config=PostgresConnectionConfig(
                    host=connect_params["host"],
                    port=connect_params["port"],
                    database=connect_params["database"],
                    username=connect_params["user"],
                    access_config=PostgresAccessConfig(password=connect_params["password"]),
                )
            )
            if uploader.is_async():
                await uploader.run_async(path=staged_path, file_data=mock_file_data)
            else:
                uploader.run(path=staged_path, file_data=mock_file_data)

            staged_df = pd.read_json(staged_path, orient="records", lines=True)
            sample_element = staged_df.iloc[0]
            expected_num_elements = len(staged_df)
            validate_destination(
                connect_params=connect_params,
                expected_num_elements=expected_num_elements,
                expected_text=sample_element["text"],
                test_embedding=sample_element["embeddings"],
            )

test/integration/connectors/test_s3.py
ADDED
@@ -0,0 +1,152 @@
import os
import tempfile
import uuid
from pathlib import Path

import pytest

from test.integration.connectors.utils.constants import (
    DESTINATION_TAG,
    SOURCE_TAG,
    env_setup_path,
)
from test.integration.connectors.utils.docker_compose import docker_compose_context
from test.integration.connectors.utils.validation import (
    ValidationConfigs,
    source_connector_validation,
)
from test.integration.utils import requires_env
from unstructured_ingest.error import (
    SourceConnectionError,
)
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
    CONNECTOR_TYPE,
    S3AccessConfig,
    S3ConnectionConfig,
    S3Downloader,
    S3DownloaderConfig,
    S3Indexer,
    S3IndexerConfig,
    S3Uploader,
    S3UploaderConfig,
)


def validate_predownload_file_data(file_data: FileData):
    assert file_data.connector_type == CONNECTOR_TYPE
    assert file_data.local_download_path is None


def validate_postdownload_file_data(file_data: FileData):
    assert file_data.connector_type == CONNECTOR_TYPE
    assert file_data.local_download_path is not None


@pytest.fixture
def anon_connection_config() -> S3ConnectionConfig:
    return S3ConnectionConfig(access_config=S3AccessConfig(), anonymous=True)


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
async def test_s3_source(anon_connection_config: S3ConnectionConfig):
    indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/")
    with tempfile.TemporaryDirectory() as tempdir:
        tempdir_path = Path(tempdir)
        download_config = S3DownloaderConfig(download_dir=tempdir_path)
        indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
        downloader = S3Downloader(
            connection_config=anon_connection_config, download_config=download_config
        )
        await source_connector_validation(
            indexer=indexer,
            downloader=downloader,
            configs=ValidationConfigs(
                test_id="s3",
                predownload_file_data_check=validate_predownload_file_data,
                postdownload_file_data_check=validate_postdownload_file_data,
                expected_num_files=4,
            ),
        )


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
async def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
    indexer_config = S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/")
    indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
    with pytest.raises(SourceConnectionError):
        indexer.precheck()


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio")
async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
    anon_connection_config.endpoint_url = "http://localhost:9000"
    indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
    with docker_compose_context(docker_compose_path=env_setup_path / "minio"):
        with tempfile.TemporaryDirectory() as tempdir:
            tempdir_path = Path(tempdir)
            download_config = S3DownloaderConfig(download_dir=tempdir_path)
            indexer = S3Indexer(
                connection_config=anon_connection_config, index_config=indexer_config
            )
            downloader = S3Downloader(
                connection_config=anon_connection_config, download_config=download_config
            )
            await source_connector_validation(
                indexer=indexer,
                downloader=downloader,
                configs=ValidationConfigs(
                    test_id="s3-minio",
                    predownload_file_data_check=validate_predownload_file_data,
                    postdownload_file_data_check=validate_postdownload_file_data,
                    expected_num_files=1,
                    exclude_fields_extend=[
                        "metadata.date_modified",
                        "metadata.date_created",
                        "additional_metadata.LastModified",
                    ],
                ),
            )


def get_aws_credentials() -> dict:
    access_key = os.getenv("S3_INGEST_TEST_ACCESS_KEY", None)
    assert access_key
    secret_key = os.getenv("S3_INGEST_TEST_SECRET_KEY", None)
    assert secret_key
    return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
async def test_s3_destination(upload_file: Path):
    aws_credentials = get_aws_credentials()
    s3_bucket = "s3://utic-ingest-test-fixtures"
    destination_path = f"{s3_bucket}/destination/{uuid.uuid4()}"
    connection_config = S3ConnectionConfig(
        access_config=S3AccessConfig(
            key=aws_credentials["aws_access_key_id"],
            secret=aws_credentials["aws_secret_access_key"],
        ),
    )
    upload_config = S3UploaderConfig(remote_url=destination_path)
    uploader = S3Uploader(connection_config=connection_config, upload_config=upload_config)
    s3fs = uploader.fs
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    try:
        if uploader.is_async():
            await uploader.run_async(path=upload_file, file_data=file_data)
        else:
            uploader.run(path=upload_file, file_data=file_data)
        uploaded_files = s3fs.ls(path=destination_path)
        assert len(uploaded_files) == 1
    finally:
        s3fs.rm(path=destination_path, recursive=True)

test/integration/connectors/test_sqlite.py
ADDED
@@ -0,0 +1,91 @@
import sqlite3
import tempfile
from contextlib import contextmanager
from pathlib import Path

import pandas as pd
import pytest

from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
    CONNECTOR_TYPE,
    SQLiteConnectionConfig,
    SQLiteUploader,
    SQLiteUploadStager,
)


@contextmanager
def sqlite_setup() -> Path:
    # Provision the local file that sqlite points to to have the desired schema for the integration
    # tests and make sure the file and connection get cleaned up by using a context manager.
    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = Path(tmpdir) / "elements.db"
        db_init_path = env_setup_path / "sql" / "sqlite-schema.sql"
        assert db_init_path.exists()
        assert db_init_path.is_file()
        connection = None
        try:
            connection = sqlite3.connect(database=db_path)
            with db_init_path.open("r") as f:
                query = f.read()
                cursor = connection.cursor()
                cursor.executescript(query)
            yield db_path
        finally:
            if connection:
                connection.close()


def validate_destination(db_path: Path, expected_num_elements: int):
    # Run the following validations:
    # * Check that the number of records in the table match the expected value
    connection = None
    try:
        connection = sqlite3.connect(database=db_path)
        query = "select count(*) from elements;"
        cursor = connection.cursor()
        cursor.execute(query)
        count = cursor.fetchone()[0]
        assert (
            count == expected_num_elements
        ), f"dest check failed: got {count}, expected {expected_num_elements}"
    finally:
        if connection:
            connection.close()


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
async def test_sqlite_destination(upload_file: Path):
    # the sqlite destination connector doesn't leverage the file data but is required as an input,
    # mocking it with arbitrary values to meet the base requirements:
    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
    with sqlite_setup() as db_path:
        with tempfile.TemporaryDirectory() as tmpdir:
            stager = SQLiteUploadStager()
            stager_params = {
                "elements_filepath": upload_file,
                "file_data": mock_file_data,
                "output_dir": Path(tmpdir),
                "output_filename": "test_db",
            }
            if stager.is_async():
                staged_path = await stager.run_async(**stager_params)
            else:
                staged_path = stager.run(**stager_params)

            # The stager should append the `.json` suffix to the output filename passed in.
            assert staged_path.name == "test_db.json"

            uploader = SQLiteUploader(
                connection_config=SQLiteConnectionConfig(database_path=db_path)
            )
            if uploader.is_async():
                await uploader.run_async(path=staged_path, file_data=mock_file_data)
            else:
                uploader.run(path=staged_path, file_data=mock_file_data)

            staged_df = pd.read_json(staged_path, orient="records", lines=True)
            validate_destination(db_path=db_path, expected_num_elements=len(staged_df))

test/integration/connectors/utils/__init__.py
ADDED
File without changes

test/integration/connectors/utils/docker_compose.py
ADDED
@@ -0,0 +1,44 @@
import subprocess
from contextlib import contextmanager
from pathlib import Path


@contextmanager
def docker_compose_context(docker_compose_path: Path):
    # Dynamically run a specific docker compose file and make sure it gets cleanup by
    # by leveraging a context manager. Uses subprocess to map docker compose commands
    # to the underlying shell.
    assert docker_compose_path.exists()
    if docker_compose_path.is_dir():
        if (docker_compose_path / "docker-compose.yml").exists():
            docker_compose_path = docker_compose_path / "docker-compose.yml"
        elif (docker_compose_path / "docker-compose.yaml").exists():
            docker_compose_path = docker_compose_path / "docker-compose.yaml"
    assert docker_compose_path.is_file()
    resp = None
    try:
        cmd = f"docker compose -f {docker_compose_path.resolve()} up -d --wait"
        print(f"Running command: {cmd}")
        resp = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
        )
        # Return code from docker compose using --wait can be 1 even if no error
        yield
    except Exception as e:
        if resp:
            print("STDOUT: {}".format(resp.stdout.decode("utf-8")))
            print("STDERR: {}".format(resp.stderr.decode("utf-8")))
        raise e
    finally:
        cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v"
        print(f"Running command: {cmd}")
        final_resp = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
        )
        if final_resp.returncode != 0:
            print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
            print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))