unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.
Files changed (87)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/test_postgres.py +100 -0
  10. test/integration/connectors/test_s3.py +152 -0
  11. test/integration/connectors/test_sqlite.py +91 -0
  12. test/integration/connectors/utils/__init__.py +0 -0
  13. test/integration/connectors/utils/constants.py +7 -0
  14. test/integration/connectors/utils/docker_compose.py +44 -0
  15. test/integration/connectors/utils/validation.py +198 -0
  16. test/integration/embedders/__init__.py +0 -0
  17. test/integration/embedders/conftest.py +13 -0
  18. test/integration/embedders/test_bedrock.py +49 -0
  19. test/integration/embedders/test_huggingface.py +26 -0
  20. test/integration/embedders/test_mixedbread.py +47 -0
  21. test/integration/embedders/test_octoai.py +41 -0
  22. test/integration/embedders/test_openai.py +41 -0
  23. test/integration/embedders/test_vertexai.py +41 -0
  24. test/integration/embedders/test_voyageai.py +41 -0
  25. test/integration/embedders/togetherai.py +43 -0
  26. test/integration/embedders/utils.py +44 -0
  27. test/integration/partitioners/__init__.py +0 -0
  28. test/integration/partitioners/test_partitioner.py +75 -0
  29. test/integration/utils.py +15 -0
  30. test/unit/__init__.py +0 -0
  31. test/unit/embed/__init__.py +0 -0
  32. test/unit/embed/test_mixedbreadai.py +41 -0
  33. test/unit/embed/test_octoai.py +20 -0
  34. test/unit/embed/test_openai.py +20 -0
  35. test/unit/embed/test_vertexai.py +25 -0
  36. test/unit/embed/test_voyageai.py +24 -0
  37. test/unit/test_chunking_utils.py +36 -0
  38. test/unit/test_error.py +27 -0
  39. test/unit/test_interfaces.py +280 -0
  40. test/unit/test_interfaces_v2.py +26 -0
  41. test/unit/test_logger.py +78 -0
  42. test/unit/test_utils.py +164 -0
  43. test/unit/test_utils_v2.py +82 -0
  44. unstructured_ingest/__version__.py +1 -1
  45. unstructured_ingest/cli/interfaces.py +2 -2
  46. unstructured_ingest/connector/notion/types/block.py +1 -0
  47. unstructured_ingest/connector/notion/types/database.py +1 -0
  48. unstructured_ingest/connector/notion/types/page.py +1 -0
  49. unstructured_ingest/embed/bedrock.py +0 -20
  50. unstructured_ingest/embed/huggingface.py +0 -21
  51. unstructured_ingest/embed/interfaces.py +29 -3
  52. unstructured_ingest/embed/mixedbreadai.py +0 -36
  53. unstructured_ingest/embed/octoai.py +2 -24
  54. unstructured_ingest/embed/openai.py +0 -20
  55. unstructured_ingest/embed/togetherai.py +40 -0
  56. unstructured_ingest/embed/vertexai.py +0 -20
  57. unstructured_ingest/embed/voyageai.py +1 -24
  58. unstructured_ingest/interfaces.py +1 -1
  59. unstructured_ingest/utils/dep_check.py +12 -0
  60. unstructured_ingest/v2/cli/utils/click.py +21 -2
  61. unstructured_ingest/v2/interfaces/connector.py +22 -2
  62. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  63. unstructured_ingest/v2/processes/chunker.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
  65. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  70. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  71. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
  72. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  73. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
  75. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
  77. unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
  78. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
  79. unstructured_ingest/v2/processes/embedder.py +13 -0
  80. unstructured_ingest/v2/processes/partitioner.py +2 -1
  81. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
  82. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
  83. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
  84. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  85. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
  86. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
  87. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
test/__init__.py ADDED
File without changes
test/integration/__init__.py ADDED
File without changes
test/integration/chunkers/__init__.py ADDED
File without changes
test/integration/chunkers/test_chunkers.py ADDED
@@ -0,0 +1,42 @@
+ import os
+ from pathlib import Path
+
+ import pytest
+
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
+
+ int_test_dir = Path(__file__).parent
+ assets_dir = int_test_dir / "assets"
+
+ chunker_files = [path for path in assets_dir.iterdir() if path.is_file()]
+
+
+ @pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
+ @pytest.mark.parametrize("strategy", ["basic", "by_title", "by_similarity", "by_page"])
+ @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
+ @pytest.mark.asyncio
+ async def test_chunker_api(chunker_file: Path, strategy: str):
+     api_key = os.getenv("UNSTRUCTURED_API_KEY")
+     api_url = os.getenv("UNSTRUCTURED_API_URL")
+
+     chunker_config = ChunkerConfig(
+         chunking_strategy=strategy,
+         chunk_by_api=True,
+         chunk_api_key=api_key,
+         chunking_endpoint=api_url,
+     )
+     chunker = Chunker(config=chunker_config)
+     results = await chunker.run_async(elements_filepath=chunker_file)
+     assert results
+
+
+ @pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
+ @pytest.mark.parametrize("strategy", ["basic", "by_title"])
+ def test_chunker_basic(chunker_file: Path, strategy: str):
+     chunker_config = ChunkerConfig(
+         chunking_strategy=strategy,
+     )
+     chunker = Chunker(config=chunker_config)
+     results = chunker.run(elements_filepath=chunker_file)
+     assert results
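
Note: the requires_env decorator used by several tests in this diff comes from test/integration/utils.py (item 29 above, +15 lines, not reproduced in this section). As a rough illustration only, not the package's actual implementation, such a helper is typically a thin wrapper around pytest.mark.skipif that skips the test when any of the named environment variables is unset:

    import os

    import pytest


    def requires_env(*env_vars: str):
        # Hypothetical sketch: skip the decorated test unless every named
        # environment variable is present in the environment.
        missing = [var for var in env_vars if os.getenv(var) is None]
        return pytest.mark.skipif(
            bool(missing),
            reason=f"Missing required environment variables: {', '.join(missing)}",
        )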
test/integration/connectors/__init__.py ADDED
File without changes
test/integration/connectors/conftest.py ADDED
@@ -0,0 +1,15 @@
+ from pathlib import Path
+
+ import pytest
+
+ FILENAME = "DA-1p-with-duplicate-pages.pdf.json"
+
+
+ @pytest.fixture
+ def upload_file() -> Path:
+     int_test_dir = Path(__file__).parent
+     assets_dir = int_test_dir / "assets"
+     upload_file = assets_dir / FILENAME
+     assert upload_file.exists()
+     assert upload_file.is_file()
+     return upload_file
test/integration/connectors/databricks_tests/__init__.py ADDED
File without changes
test/integration/connectors/databricks_tests/test_volumes_native.py ADDED
@@ -0,0 +1,165 @@
+ import json
+ import os
+ import tempfile
+ import uuid
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ import pytest
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.errors.platform import NotFound
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
+     CONNECTOR_TYPE,
+     DatabricksNativeVolumesAccessConfig,
+     DatabricksNativeVolumesConnectionConfig,
+     DatabricksNativeVolumesDownloader,
+     DatabricksNativeVolumesDownloaderConfig,
+     DatabricksNativeVolumesIndexer,
+     DatabricksNativeVolumesIndexerConfig,
+     DatabricksNativeVolumesUploader,
+     DatabricksNativeVolumesUploaderConfig,
+ )
+
+
+ @dataclass
+ class EnvData:
+     host: str
+     client_id: str
+     client_secret: str
+     catalog: str
+
+     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
+         return DatabricksNativeVolumesConnectionConfig(
+             host=self.host,
+             access_config=DatabricksNativeVolumesAccessConfig(
+                 client_id=self.client_id,
+                 client_secret=self.client_secret,
+             ),
+         )
+
+
+ def get_env_data() -> EnvData:
+     return EnvData(
+         host=os.environ["DATABRICKS_HOST"],
+         client_id=os.environ["DATABRICKS_CLIENT_ID"],
+         client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
+         catalog=os.environ["DATABRICKS_CATALOG"],
+     )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env(
+     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
+ )
+ async def test_volumes_native_source():
+     env_data = get_env_data()
+     indexer_config = DatabricksNativeVolumesIndexerConfig(
+         recursive=True,
+         volume="test-platform",
+         volume_path="databricks-volumes-test-input",
+         catalog=env_data.catalog,
+     )
+     connection_config = env_data.get_connection_config()
+     with tempfile.TemporaryDirectory() as tempdir:
+         tempdir_path = Path(tempdir)
+         download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
+         indexer = DatabricksNativeVolumesIndexer(
+             connection_config=connection_config, index_config=indexer_config
+         )
+         downloader = DatabricksNativeVolumesDownloader(
+             connection_config=connection_config, download_config=download_config
+         )
+         await source_connector_validation(
+             indexer=indexer,
+             downloader=downloader,
+             configs=ValidationConfigs(
+                 test_id="databricks_volumes_native",
+                 expected_num_files=1,
+             ),
+         )
+
+
+ def _get_volume_path(catalog: str, volume: str, volume_path: str):
+     return f"/Volumes/{catalog}/default/{volume}/{volume_path}"
+
+
+ @contextmanager
+ def databricks_destination_context(env_data: EnvData, volume: str, volume_path) -> WorkspaceClient:
+     client = WorkspaceClient(
+         host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
+     )
+     try:
+         yield client
+     finally:
+         # Cleanup
+         try:
+             for file in client.files.list_directory_contents(
+                 directory_path=_get_volume_path(env_data.catalog, volume, volume_path)
+             ):
+                 client.files.delete(file.path)
+             client.files.delete_directory(_get_volume_path(env_data.catalog, volume, volume_path))
+         except NotFound:
+             # Directory was never created, don't need to delete
+             pass
+
+
+ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_path: str):
+     files = list(
+         client.files.list_directory_contents(
+             directory_path=_get_volume_path(catalog, volume, volume_path)
+         )
+     )
+
+     assert len(files) == 1
+
+     resp = client.files.download(files[0].path)
+     data = json.loads(resp.contents.read())
+
+     assert len(data) == 22
+     element_types = {v["type"] for v in data}
+     assert len(element_types) == 1
+     assert "CompositeElement" in element_types
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env(
+     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
+ )
+ async def test_volumes_native_destination(upload_file: Path):
+     env_data = get_env_data()
+     volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
+     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+     with databricks_destination_context(
+         volume="test-platform", volume_path=volume_path, env_data=env_data
+     ) as workspace_client:
+         connection_config = env_data.get_connection_config()
+         uploader = DatabricksNativeVolumesUploader(
+             connection_config=connection_config,
+             upload_config=DatabricksNativeVolumesUploaderConfig(
+                 volume="test-platform",
+                 volume_path=volume_path,
+                 catalog=env_data.catalog,
+             ),
+         )
+         if uploader.is_async():
+             await uploader.run_async(path=upload_file, file_data=mock_file_data)
+         else:
+             uploader.run(path=upload_file, file_data=mock_file_data)
+
+         validate_upload(
+             client=workspace_client,
+             catalog=env_data.catalog,
+             volume="test-platform",
+             volume_path=volume_path,
+         )
test/integration/connectors/test_postgres.py ADDED
@@ -0,0 +1,100 @@
+ import tempfile
+ from pathlib import Path
+
+ import pandas as pd
+ import pytest
+ from psycopg2 import connect
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
+ from test.integration.connectors.utils.docker_compose import docker_compose_context
+ from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
+     CONNECTOR_TYPE,
+     PostgresAccessConfig,
+     PostgresConnectionConfig,
+     PostgresUploader,
+     PostgresUploadStager,
+ )
+
+
+ def validate_destination(
+     connect_params: dict,
+     expected_num_elements: int,
+     test_embedding: list[float],
+     expected_text: str,
+ ):
+     # Run the following validations:
+     # * Check that the number of records in the table match the expected value
+     # * Given the embedding, make sure it matches the associated text it belongs to
+     with connect(**connect_params) as connection:
+         cursor = connection.cursor()
+         query = "select count(*) from elements;"
+         cursor.execute(query)
+         count = cursor.fetchone()[0]
+         assert (
+             count == expected_num_elements
+         ), f"dest check failed: got {count}, expected {expected_num_elements}"
+
+         cursor.execute("SELECT embeddings FROM elements order by text limit 1")
+         similarity_query = (
+             f"SELECT text FROM elements ORDER BY embeddings <-> '{test_embedding}' LIMIT 1;"
+         )
+         cursor.execute(similarity_query)
+         res = cursor.fetchone()
+         assert res[0] == expected_text
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+ async def test_postgres_destination(upload_file: Path):
+     # the postgres destination connector doesn't leverage the file data but is required as an input,
+     # mocking it with arbitrary values to meet the base requirements:
+     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+     with docker_compose_context(docker_compose_path=env_setup_path / "sql"):
+         with tempfile.TemporaryDirectory() as tmpdir:
+             stager = PostgresUploadStager()
+             stager_params = {
+                 "elements_filepath": upload_file,
+                 "file_data": mock_file_data,
+                 "output_dir": Path(tmpdir),
+                 "output_filename": "test_db",
+             }
+             if stager.is_async():
+                 staged_path = await stager.run_async(**stager_params)
+             else:
+                 staged_path = stager.run(**stager_params)
+
+             # The stager should append the `.json` suffix to the output filename passed in.
+             assert staged_path.name == "test_db.json"
+
+             connect_params = {
+                 "host": "localhost",
+                 "port": 5433,
+                 "database": "elements",
+                 "user": "unstructured",
+                 "password": "test",
+             }
+
+             uploader = PostgresUploader(
+                 connection_config=PostgresConnectionConfig(
+                     host=connect_params["host"],
+                     port=connect_params["port"],
+                     database=connect_params["database"],
+                     username=connect_params["user"],
+                     access_config=PostgresAccessConfig(password=connect_params["password"]),
+                 )
+             )
+             if uploader.is_async():
+                 await uploader.run_async(path=staged_path, file_data=mock_file_data)
+             else:
+                 uploader.run(path=staged_path, file_data=mock_file_data)
+
+             staged_df = pd.read_json(staged_path, orient="records", lines=True)
+             sample_element = staged_df.iloc[0]
+             expected_num_elements = len(staged_df)
+             validate_destination(
+                 connect_params=connect_params,
+                 expected_num_elements=expected_num_elements,
+                 expected_text=sample_element["text"],
+                 test_embedding=sample_element["embeddings"],
+             )
test/integration/connectors/test_s3.py ADDED
@@ -0,0 +1,152 @@
+ import os
+ import tempfile
+ import uuid
+ from pathlib import Path
+
+ import pytest
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+     SOURCE_TAG,
+     env_setup_path,
+ )
+ from test.integration.connectors.utils.docker_compose import docker_compose_context
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.error import (
+     SourceConnectionError,
+ )
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
+     CONNECTOR_TYPE,
+     S3AccessConfig,
+     S3ConnectionConfig,
+     S3Downloader,
+     S3DownloaderConfig,
+     S3Indexer,
+     S3IndexerConfig,
+     S3Uploader,
+     S3UploaderConfig,
+ )
+
+
+ def validate_predownload_file_data(file_data: FileData):
+     assert file_data.connector_type == CONNECTOR_TYPE
+     assert file_data.local_download_path is None
+
+
+ def validate_postdownload_file_data(file_data: FileData):
+     assert file_data.connector_type == CONNECTOR_TYPE
+     assert file_data.local_download_path is not None
+
+
+ @pytest.fixture
+ def anon_connection_config() -> S3ConnectionConfig:
+     return S3ConnectionConfig(access_config=S3AccessConfig(), anonymous=True)
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ async def test_s3_source(anon_connection_config: S3ConnectionConfig):
+     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/")
+     with tempfile.TemporaryDirectory() as tempdir:
+         tempdir_path = Path(tempdir)
+         download_config = S3DownloaderConfig(download_dir=tempdir_path)
+         indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
+         downloader = S3Downloader(
+             connection_config=anon_connection_config, download_config=download_config
+         )
+         await source_connector_validation(
+             indexer=indexer,
+             downloader=downloader,
+             configs=ValidationConfigs(
+                 test_id="s3",
+                 predownload_file_data_check=validate_predownload_file_data,
+                 postdownload_file_data_check=validate_postdownload_file_data,
+                 expected_num_files=4,
+             ),
+         )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ async def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
+     indexer_config = S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/")
+     indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
+     with pytest.raises(SourceConnectionError):
+         indexer.precheck()
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio")
+ async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
+     anon_connection_config.endpoint_url = "http://localhost:9000"
+     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
+     with docker_compose_context(docker_compose_path=env_setup_path / "minio"):
+         with tempfile.TemporaryDirectory() as tempdir:
+             tempdir_path = Path(tempdir)
+             download_config = S3DownloaderConfig(download_dir=tempdir_path)
+             indexer = S3Indexer(
+                 connection_config=anon_connection_config, index_config=indexer_config
+             )
+             downloader = S3Downloader(
+                 connection_config=anon_connection_config, download_config=download_config
+             )
+             await source_connector_validation(
+                 indexer=indexer,
+                 downloader=downloader,
+                 configs=ValidationConfigs(
+                     test_id="s3-minio",
+                     predownload_file_data_check=validate_predownload_file_data,
+                     postdownload_file_data_check=validate_postdownload_file_data,
+                     expected_num_files=1,
+                     exclude_fields_extend=[
+                         "metadata.date_modified",
+                         "metadata.date_created",
+                         "additional_metadata.LastModified",
+                     ],
+                 ),
+             )
+
+
+ def get_aws_credentials() -> dict:
+     access_key = os.getenv("S3_INGEST_TEST_ACCESS_KEY", None)
+     assert access_key
+     secret_key = os.getenv("S3_INGEST_TEST_SECRET_KEY", None)
+     assert secret_key
+     return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
+ async def test_s3_destination(upload_file: Path):
+     aws_credentials = get_aws_credentials()
+     s3_bucket = "s3://utic-ingest-test-fixtures"
+     destination_path = f"{s3_bucket}/destination/{uuid.uuid4()}"
+     connection_config = S3ConnectionConfig(
+         access_config=S3AccessConfig(
+             key=aws_credentials["aws_access_key_id"],
+             secret=aws_credentials["aws_secret_access_key"],
+         ),
+     )
+     upload_config = S3UploaderConfig(remote_url=destination_path)
+     uploader = S3Uploader(connection_config=connection_config, upload_config=upload_config)
+     s3fs = uploader.fs
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     try:
+         if uploader.is_async():
+             await uploader.run_async(path=upload_file, file_data=file_data)
+         else:
+             uploader.run(path=upload_file, file_data=file_data)
+         uploaded_files = s3fs.ls(path=destination_path)
+         assert len(uploaded_files) == 1
+     finally:
+         s3fs.rm(path=destination_path, recursive=True)
test/integration/connectors/test_sqlite.py ADDED
@@ -0,0 +1,91 @@
+ import sqlite3
+ import tempfile
+ from contextlib import contextmanager
+ from pathlib import Path
+
+ import pandas as pd
+ import pytest
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
+ from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
+     CONNECTOR_TYPE,
+     SQLiteConnectionConfig,
+     SQLiteUploader,
+     SQLiteUploadStager,
+ )
+
+
+ @contextmanager
+ def sqlite_setup() -> Path:
+     # Provision the local file that sqlite points to to have the desired schema for the integration
+     # tests and make sure the file and connection get cleaned up by using a context manager.
+     with tempfile.TemporaryDirectory() as tmpdir:
+         db_path = Path(tmpdir) / "elements.db"
+         db_init_path = env_setup_path / "sql" / "sqlite-schema.sql"
+         assert db_init_path.exists()
+         assert db_init_path.is_file()
+         connection = None
+         try:
+             connection = sqlite3.connect(database=db_path)
+             with db_init_path.open("r") as f:
+                 query = f.read()
+             cursor = connection.cursor()
+             cursor.executescript(query)
+             yield db_path
+         finally:
+             if connection:
+                 connection.close()
+
+
+ def validate_destination(db_path: Path, expected_num_elements: int):
+     # Run the following validations:
+     # * Check that the number of records in the table match the expected value
+     connection = None
+     try:
+         connection = sqlite3.connect(database=db_path)
+         query = "select count(*) from elements;"
+         cursor = connection.cursor()
+         cursor.execute(query)
+         count = cursor.fetchone()[0]
+         assert (
+             count == expected_num_elements
+         ), f"dest check failed: got {count}, expected {expected_num_elements}"
+     finally:
+         if connection:
+             connection.close()
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+ async def test_sqlite_destination(upload_file: Path):
+     # the sqlite destination connector doesn't leverage the file data but is required as an input,
+     # mocking it with arbitrary values to meet the base requirements:
+     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+     with sqlite_setup() as db_path:
+         with tempfile.TemporaryDirectory() as tmpdir:
+             stager = SQLiteUploadStager()
+             stager_params = {
+                 "elements_filepath": upload_file,
+                 "file_data": mock_file_data,
+                 "output_dir": Path(tmpdir),
+                 "output_filename": "test_db",
+             }
+             if stager.is_async():
+                 staged_path = await stager.run_async(**stager_params)
+             else:
+                 staged_path = stager.run(**stager_params)
+
+             # The stager should append the `.json` suffix to the output filename passed in.
+             assert staged_path.name == "test_db.json"
+
+             uploader = SQLiteUploader(
+                 connection_config=SQLiteConnectionConfig(database_path=db_path)
+             )
+             if uploader.is_async():
+                 await uploader.run_async(path=staged_path, file_data=mock_file_data)
+             else:
+                 uploader.run(path=staged_path, file_data=mock_file_data)
+
+             staged_df = pd.read_json(staged_path, orient="records", lines=True)
+             validate_destination(db_path=db_path, expected_num_elements=len(staged_df))
test/integration/connectors/utils/__init__.py ADDED
File without changes
test/integration/connectors/utils/constants.py ADDED
@@ -0,0 +1,7 @@
+ from pathlib import Path
+
+ SOURCE_TAG = "source"
+ DESTINATION_TAG = "destination"
+
+ env_setup_path = Path(__file__).parents[4] / "test_e2e" / "env_setup"
+ expected_results_path = Path(__file__).parents[1] / "expected_results"
test/integration/connectors/utils/docker_compose.py ADDED
@@ -0,0 +1,44 @@
+ import subprocess
+ from contextlib import contextmanager
+ from pathlib import Path
+
+
+ @contextmanager
+ def docker_compose_context(docker_compose_path: Path):
+     # Dynamically run a specific docker compose file and make sure it gets cleanup by
+     # by leveraging a context manager. Uses subprocess to map docker compose commands
+     # to the underlying shell.
+     assert docker_compose_path.exists()
+     if docker_compose_path.is_dir():
+         if (docker_compose_path / "docker-compose.yml").exists():
+             docker_compose_path = docker_compose_path / "docker-compose.yml"
+         elif (docker_compose_path / "docker-compose.yaml").exists():
+             docker_compose_path = docker_compose_path / "docker-compose.yaml"
+     assert docker_compose_path.is_file()
+     resp = None
+     try:
+         cmd = f"docker compose -f {docker_compose_path.resolve()} up -d --wait"
+         print(f"Running command: {cmd}")
+         resp = subprocess.run(
+             cmd,
+             shell=True,
+             capture_output=True,
+         )
+         # Return code from docker compose using --wait can be 1 even if no error
+         yield
+     except Exception as e:
+         if resp:
+             print("STDOUT: {}".format(resp.stdout.decode("utf-8")))
+             print("STDERR: {}".format(resp.stderr.decode("utf-8")))
+         raise e
+     finally:
+         cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v"
+         print(f"Running command: {cmd}")
+         final_resp = subprocess.run(
+             cmd,
+             shell=True,
+             capture_output=True,
+         )
+         if final_resp.returncode != 0:
+             print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
+             print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))