unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest has been flagged as potentially problematic by the registry.

Files changed (86)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/sql/__init__.py +0 -0
  10. test/integration/connectors/sql/test_postgres.py +178 -0
  11. test/integration/connectors/sql/test_sqlite.py +151 -0
  12. test/integration/connectors/test_s3.py +152 -0
  13. test/integration/connectors/utils/__init__.py +0 -0
  14. test/integration/connectors/utils/constants.py +7 -0
  15. test/integration/connectors/utils/docker_compose.py +44 -0
  16. test/integration/connectors/utils/validation.py +203 -0
  17. test/integration/embedders/__init__.py +0 -0
  18. test/integration/embedders/conftest.py +13 -0
  19. test/integration/embedders/test_bedrock.py +49 -0
  20. test/integration/embedders/test_huggingface.py +26 -0
  21. test/integration/embedders/test_mixedbread.py +47 -0
  22. test/integration/embedders/test_octoai.py +41 -0
  23. test/integration/embedders/test_openai.py +41 -0
  24. test/integration/embedders/test_vertexai.py +41 -0
  25. test/integration/embedders/test_voyageai.py +41 -0
  26. test/integration/embedders/togetherai.py +43 -0
  27. test/integration/embedders/utils.py +44 -0
  28. test/integration/partitioners/__init__.py +0 -0
  29. test/integration/partitioners/test_partitioner.py +75 -0
  30. test/integration/utils.py +15 -0
  31. test/unit/__init__.py +0 -0
  32. test/unit/embed/__init__.py +0 -0
  33. test/unit/embed/test_mixedbreadai.py +41 -0
  34. test/unit/embed/test_octoai.py +20 -0
  35. test/unit/embed/test_openai.py +20 -0
  36. test/unit/embed/test_vertexai.py +25 -0
  37. test/unit/embed/test_voyageai.py +24 -0
  38. test/unit/test_chunking_utils.py +36 -0
  39. test/unit/test_error.py +27 -0
  40. test/unit/test_interfaces.py +280 -0
  41. test/unit/test_interfaces_v2.py +26 -0
  42. test/unit/test_logger.py +78 -0
  43. test/unit/test_utils.py +164 -0
  44. test/unit/test_utils_v2.py +82 -0
  45. unstructured_ingest/__version__.py +1 -1
  46. unstructured_ingest/cli/interfaces.py +2 -2
  47. unstructured_ingest/connector/notion/types/block.py +1 -0
  48. unstructured_ingest/connector/notion/types/database.py +1 -0
  49. unstructured_ingest/connector/notion/types/page.py +1 -0
  50. unstructured_ingest/embed/bedrock.py +0 -20
  51. unstructured_ingest/embed/huggingface.py +0 -21
  52. unstructured_ingest/embed/interfaces.py +29 -3
  53. unstructured_ingest/embed/mixedbreadai.py +0 -36
  54. unstructured_ingest/embed/octoai.py +2 -24
  55. unstructured_ingest/embed/openai.py +0 -20
  56. unstructured_ingest/embed/togetherai.py +40 -0
  57. unstructured_ingest/embed/vertexai.py +0 -20
  58. unstructured_ingest/embed/voyageai.py +1 -24
  59. unstructured_ingest/interfaces.py +1 -1
  60. unstructured_ingest/v2/cli/utils/click.py +21 -2
  61. unstructured_ingest/v2/interfaces/connector.py +22 -2
  62. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  63. unstructured_ingest/v2/processes/chunker.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  65. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  70. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  71. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
  72. unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  74. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  75. unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
  76. unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
  77. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
  78. unstructured_ingest/v2/processes/embedder.py +13 -0
  79. unstructured_ingest/v2/processes/partitioner.py +2 -1
  80. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
  83. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  84. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
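
The most visible structural changes above are the new integration test suite, a new connectors/databricks/ package for volume-based sources and destinations, and the split of the single-module SQL connector (unstructured_ingest/v2/processes/connectors/sql.py, dropped in this release per item 83) into a connectors/sql/ package with dedicated postgres.py and sqlite.py modules. As rough orientation only, the sketch below shows the new per-backend import paths; the class names are taken directly from the integration tests in this diff, and nothing beyond the imports (such as pipeline wiring) is implied:

    # Sketch: per-backend import paths introduced by the new connectors/sql package.
    # Class names come from the integration tests shown later in this diff.
    from unstructured_ingest.v2.processes.connectors.sql.postgres import (
        PostgresAccessConfig,
        PostgresConnectionConfig,
        PostgresUploader,
        PostgresUploadStager,
    )
    from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
        SQLiteConnectionConfig,
        SQLiteUploader,
        SQLiteUploadStager,
    )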
test/__init__.py ADDED
File without changes
test/integration/__init__.py ADDED
File without changes
test/integration/chunkers/__init__.py ADDED
File without changes
test/integration/chunkers/test_chunkers.py ADDED
@@ -0,0 +1,42 @@
+ import os
+ from pathlib import Path
+
+ import pytest
+
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
+
+ int_test_dir = Path(__file__).parent
+ assets_dir = int_test_dir / "assets"
+
+ chunker_files = [path for path in assets_dir.iterdir() if path.is_file()]
+
+
+ @pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
+ @pytest.mark.parametrize("strategy", ["basic", "by_title", "by_similarity", "by_page"])
+ @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
+ @pytest.mark.asyncio
+ async def test_chunker_api(chunker_file: Path, strategy: str):
+     api_key = os.getenv("UNSTRUCTURED_API_KEY")
+     api_url = os.getenv("UNSTRUCTURED_API_URL")
+
+     chunker_config = ChunkerConfig(
+         chunking_strategy=strategy,
+         chunk_by_api=True,
+         chunk_api_key=api_key,
+         chunking_endpoint=api_url,
+     )
+     chunker = Chunker(config=chunker_config)
+     results = await chunker.run_async(elements_filepath=chunker_file)
+     assert results
+
+
+ @pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
+ @pytest.mark.parametrize("strategy", ["basic", "by_title"])
+ def test_chunker_basic(chunker_file: Path, strategy: str):
+     chunker_config = ChunkerConfig(
+         chunking_strategy=strategy,
+     )
+     chunker = Chunker(config=chunker_config)
+     results = chunker.run(elements_filepath=chunker_file)
+     assert results
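
For orientation, the Chunker interface exercised above can also be driven directly outside pytest. A minimal sketch of the local (non-API) path, assuming a JSON file of partitioned elements; the file path below is hypothetical:

    from pathlib import Path

    from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig

    # Hypothetical input: a JSON file of partitioned elements produced by an earlier step.
    elements_file = Path("partitioned-output/example.json")

    # Local chunking, mirroring test_chunker_basic above; no API key required.
    chunker = Chunker(config=ChunkerConfig(chunking_strategy="by_title"))
    results = chunker.run(elements_filepath=elements_file)
    assert results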
test/integration/connectors/__init__.py ADDED
File without changes
test/integration/connectors/conftest.py ADDED
@@ -0,0 +1,15 @@
+ from pathlib import Path
+
+ import pytest
+
+ FILENAME = "DA-1p-with-duplicate-pages.pdf.json"
+
+
+ @pytest.fixture
+ def upload_file() -> Path:
+     int_test_dir = Path(__file__).parent
+     assets_dir = int_test_dir / "assets"
+     upload_file = assets_dir / FILENAME
+     assert upload_file.exists()
+     assert upload_file.is_file()
+     return upload_file
test/integration/connectors/databricks_tests/__init__.py ADDED
File without changes
test/integration/connectors/databricks_tests/test_volumes_native.py ADDED
@@ -0,0 +1,165 @@
+ import json
+ import os
+ import tempfile
+ import uuid
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ import pytest
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.errors.platform import NotFound
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
+     CONNECTOR_TYPE,
+     DatabricksNativeVolumesAccessConfig,
+     DatabricksNativeVolumesConnectionConfig,
+     DatabricksNativeVolumesDownloader,
+     DatabricksNativeVolumesDownloaderConfig,
+     DatabricksNativeVolumesIndexer,
+     DatabricksNativeVolumesIndexerConfig,
+     DatabricksNativeVolumesUploader,
+     DatabricksNativeVolumesUploaderConfig,
+ )
+
+
+ @dataclass
+ class EnvData:
+     host: str
+     client_id: str
+     client_secret: str
+     catalog: str
+
+     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
+         return DatabricksNativeVolumesConnectionConfig(
+             host=self.host,
+             access_config=DatabricksNativeVolumesAccessConfig(
+                 client_id=self.client_id,
+                 client_secret=self.client_secret,
+             ),
+         )
+
+
+ def get_env_data() -> EnvData:
+     return EnvData(
+         host=os.environ["DATABRICKS_HOST"],
+         client_id=os.environ["DATABRICKS_CLIENT_ID"],
+         client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
+         catalog=os.environ["DATABRICKS_CATALOG"],
+     )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env(
+     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
+ )
+ async def test_volumes_native_source():
+     env_data = get_env_data()
+     indexer_config = DatabricksNativeVolumesIndexerConfig(
+         recursive=True,
+         volume="test-platform",
+         volume_path="databricks-volumes-test-input",
+         catalog=env_data.catalog,
+     )
+     connection_config = env_data.get_connection_config()
+     with tempfile.TemporaryDirectory() as tempdir:
+         tempdir_path = Path(tempdir)
+         download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
+         indexer = DatabricksNativeVolumesIndexer(
+             connection_config=connection_config, index_config=indexer_config
+         )
+         downloader = DatabricksNativeVolumesDownloader(
+             connection_config=connection_config, download_config=download_config
+         )
+         await source_connector_validation(
+             indexer=indexer,
+             downloader=downloader,
+             configs=ValidationConfigs(
+                 test_id="databricks_volumes_native",
+                 expected_num_files=1,
+             ),
+         )
+
+
+ def _get_volume_path(catalog: str, volume: str, volume_path: str):
+     return f"/Volumes/{catalog}/default/{volume}/{volume_path}"
+
+
+ @contextmanager
+ def databricks_destination_context(env_data: EnvData, volume: str, volume_path) -> WorkspaceClient:
+     client = WorkspaceClient(
+         host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
+     )
+     try:
+         yield client
+     finally:
+         # Cleanup
+         try:
+             for file in client.files.list_directory_contents(
+                 directory_path=_get_volume_path(env_data.catalog, volume, volume_path)
+             ):
+                 client.files.delete(file.path)
+             client.files.delete_directory(_get_volume_path(env_data.catalog, volume, volume_path))
+         except NotFound:
+             # Directory was never created, don't need to delete
+             pass
+
+
+ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_path: str):
+     files = list(
+         client.files.list_directory_contents(
+             directory_path=_get_volume_path(catalog, volume, volume_path)
+         )
+     )
+
+     assert len(files) == 1
+
+     resp = client.files.download(files[0].path)
+     data = json.loads(resp.contents.read())
+
+     assert len(data) == 22
+     element_types = {v["type"] for v in data}
+     assert len(element_types) == 1
+     assert "CompositeElement" in element_types
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env(
+     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
+ )
+ async def test_volumes_native_destination(upload_file: Path):
+     env_data = get_env_data()
+     volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
+     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+     with databricks_destination_context(
+         volume="test-platform", volume_path=volume_path, env_data=env_data
+     ) as workspace_client:
+         connection_config = env_data.get_connection_config()
+         uploader = DatabricksNativeVolumesUploader(
+             connection_config=connection_config,
+             upload_config=DatabricksNativeVolumesUploaderConfig(
+                 volume="test-platform",
+                 volume_path=volume_path,
+                 catalog=env_data.catalog,
+             ),
+         )
+         if uploader.is_async():
+             await uploader.run_async(path=upload_file, file_data=mock_file_data)
+         else:
+             uploader.run(path=upload_file, file_data=mock_file_data)
+
+         validate_upload(
+             client=workspace_client,
+             catalog=env_data.catalog,
+             volume="test-platform",
+             volume_path=volume_path,
+         )
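
Outside the test harness, the destination path above reduces to building a connection config, an uploader config, and the uploader itself. A minimal sketch, assuming the same service-principal environment variables the test requires; the volume path and local file below are hypothetical:

    import os
    from pathlib import Path

    from unstructured_ingest.v2.interfaces import FileData
    from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
        CONNECTOR_TYPE,
        DatabricksNativeVolumesAccessConfig,
        DatabricksNativeVolumesConnectionConfig,
        DatabricksNativeVolumesUploader,
        DatabricksNativeVolumesUploaderConfig,
    )

    # Credentials and catalog come from the same env vars the integration test requires.
    connection_config = DatabricksNativeVolumesConnectionConfig(
        host=os.environ["DATABRICKS_HOST"],
        access_config=DatabricksNativeVolumesAccessConfig(
            client_id=os.environ["DATABRICKS_CLIENT_ID"],
            client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
        ),
    )
    uploader = DatabricksNativeVolumesUploader(
        connection_config=connection_config,
        upload_config=DatabricksNativeVolumesUploaderConfig(
            catalog=os.environ["DATABRICKS_CATALOG"],
            volume="test-platform",        # volume name used by the test above
            volume_path="example-output",  # hypothetical target directory in the volume
        ),
    )
    # The uploader interface requires a FileData object even when its fields are not used.
    file_data = FileData(identifier="example", connector_type=CONNECTOR_TYPE)
    uploader.run(path=Path("local-elements.json"), file_data=file_data)  # hypothetical local file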
test/integration/connectors/sql/__init__.py ADDED
File without changes
test/integration/connectors/sql/test_postgres.py ADDED
@@ -0,0 +1,178 @@
+ import tempfile
+ from contextlib import contextmanager
+ from pathlib import Path
+
+ import faker
+ import pandas as pd
+ import pytest
+ from psycopg2 import connect
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+ from test.integration.connectors.utils.docker_compose import docker_compose_context
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
+     CONNECTOR_TYPE,
+     PostgresAccessConfig,
+     PostgresConnectionConfig,
+     PostgresDownloader,
+     PostgresDownloaderConfig,
+     PostgresIndexer,
+     PostgresIndexerConfig,
+     PostgresUploader,
+     PostgresUploadStager,
+ )
+
+ faker = faker.Faker()
+
+ SEED_DATA_ROWS = 40
+
+
+ @contextmanager
+ def postgres_download_setup() -> None:
+     with docker_compose_context(docker_compose_path=env_setup_path / "sql" / "postgres" / "source"):
+         connection = connect(
+             user="unstructured",
+             password="test",
+             dbname="test_db",
+             host="localhost",
+             port=5433,
+         )
+         with connection.cursor() as cursor:
+             for _ in range(SEED_DATA_ROWS):
+                 sql_statment = (
+                     f"INSERT INTO cars (brand, price) VALUES "
+                     f"('{faker.word()}', {faker.random_int()})"
+                 )
+                 cursor.execute(sql_statment)
+         connection.commit()
+         yield
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+ async def test_postgres_source():
+     connect_params = {
+         "host": "localhost",
+         "port": 5433,
+         "database": "test_db",
+         "user": "unstructured",
+         "password": "test",
+     }
+     with postgres_download_setup():
+         with tempfile.TemporaryDirectory() as tmpdir:
+             connection_config = PostgresConnectionConfig(
+                 host=connect_params["host"],
+                 port=connect_params["port"],
+                 database=connect_params["database"],
+                 username=connect_params["user"],
+                 access_config=PostgresAccessConfig(password=connect_params["password"]),
+             )
+             indexer = PostgresIndexer(
+                 connection_config=connection_config,
+                 index_config=PostgresIndexerConfig(
+                     table_name="cars", id_column="car_id", batch_size=5
+                 ),
+             )
+             downloader = PostgresDownloader(
+                 connection_config=connection_config,
+                 download_config=PostgresDownloaderConfig(
+                     fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                 ),
+             )
+             await source_connector_validation(
+                 indexer=indexer,
+                 downloader=downloader,
+                 configs=ValidationConfigs(
+                     test_id="postgres",
+                     expected_num_files=40,
+                 ),
+             )
+
+
+ def validate_destination(
+     connect_params: dict,
+     expected_num_elements: int,
+     test_embedding: list[float],
+     expected_text: str,
+ ):
+     # Run the following validations:
+     # * Check that the number of records in the table match the expected value
+     # * Given the embedding, make sure it matches the associated text it belongs to
+     with connect(**connect_params) as connection:
+         cursor = connection.cursor()
+         query = "select count(*) from elements;"
+         cursor.execute(query)
+         count = cursor.fetchone()[0]
+         assert (
+             count == expected_num_elements
+         ), f"dest check failed: got {count}, expected {expected_num_elements}"
+
+         cursor.execute("SELECT embeddings FROM elements order by text limit 1")
+         similarity_query = (
+             f"SELECT text FROM elements ORDER BY embeddings <-> '{test_embedding}' LIMIT 1;"
+         )
+         cursor.execute(similarity_query)
+         res = cursor.fetchone()
+         assert res[0] == expected_text
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+ async def test_postgres_destination(upload_file: Path):
+     # the postgres destination connector doesn't leverage the file data but is required as an input,
+     # mocking it with arbitrary values to meet the base requirements:
+     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+     with docker_compose_context(
+         docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
+     ):
+         with tempfile.TemporaryDirectory() as tmpdir:
+             stager = PostgresUploadStager()
+             stager_params = {
+                 "elements_filepath": upload_file,
+                 "file_data": mock_file_data,
+                 "output_dir": Path(tmpdir),
+                 "output_filename": "test_db",
+             }
+             if stager.is_async():
+                 staged_path = await stager.run_async(**stager_params)
+             else:
+                 staged_path = stager.run(**stager_params)
+
+             # The stager should append the `.json` suffix to the output filename passed in.
+             assert staged_path.name == "test_db.json"
+
+             connect_params = {
+                 "host": "localhost",
+                 "port": 5433,
+                 "database": "elements",
+                 "user": "unstructured",
+                 "password": "test",
+             }
+
+             uploader = PostgresUploader(
+                 connection_config=PostgresConnectionConfig(
+                     host=connect_params["host"],
+                     port=connect_params["port"],
+                     database=connect_params["database"],
+                     username=connect_params["user"],
+                     access_config=PostgresAccessConfig(password=connect_params["password"]),
+                 )
+             )
+             if uploader.is_async():
+                 await uploader.run_async(path=staged_path, file_data=mock_file_data)
+             else:
+                 uploader.run(path=staged_path, file_data=mock_file_data)
+
+             staged_df = pd.read_json(staged_path, orient="records", lines=True)
+             sample_element = staged_df.iloc[0]
+             expected_num_elements = len(staged_df)
+             validate_destination(
+                 connect_params=connect_params,
+                 expected_num_elements=expected_num_elements,
+                 expected_text=sample_element["text"],
+                 test_embedding=sample_element["embeddings"],
+             )
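
One detail worth noting in validate_destination above: the similarity check interpolates the embedding straight into the SQL string and relies on the <-> distance operator, presumably provided by the pgvector extension, to fetch the nearest stored vector. A minimal parameterized sketch of the same query, assuming the same elements table and psycopg2 connection parameters used by the test:

    from psycopg2 import connect

    def nearest_text(connect_params: dict, embedding: list[float]) -> str:
        # pgvector parses a vector literal such as '[0.1, 0.2, ...]'; binding it as a
        # parameter avoids building the query via f-string interpolation.
        with connect(**connect_params) as connection:
            cursor = connection.cursor()
            cursor.execute(
                "SELECT text FROM elements ORDER BY embeddings <-> %s LIMIT 1;",
                (str(embedding),),
            )
            return cursor.fetchone()[0]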
test/integration/connectors/sql/test_sqlite.py ADDED
@@ -0,0 +1,151 @@
+ import sqlite3
+ import tempfile
+ from contextlib import contextmanager
+ from pathlib import Path
+
+ import faker
+ import pandas as pd
+ import pytest
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, env_setup_path
+ from test.integration.connectors.utils.validation import (
+     ValidationConfigs,
+     source_connector_validation,
+ )
+ from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
+     CONNECTOR_TYPE,
+     SQLiteConnectionConfig,
+     SQLiteDownloader,
+     SQLiteDownloaderConfig,
+     SQLiteIndexer,
+     SQLiteIndexerConfig,
+     SQLiteUploader,
+     SQLiteUploadStager,
+ )
+
+ faker = faker.Faker()
+
+ SEED_DATA_ROWS = 40
+
+
+ @contextmanager
+ def sqlite_download_setup() -> Path:
+     with tempfile.TemporaryDirectory() as tmpdir:
+         db_path = Path(tmpdir) / "mock_database.db"
+         db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
+         assert db_init_path.exists()
+         assert db_init_path.is_file()
+         with sqlite3.connect(database=db_path) as sqlite_connection:
+             cursor = sqlite_connection.cursor()
+             with db_init_path.open("r") as f:
+                 query = f.read()
+             cursor.executescript(query)
+             for _ in range(SEED_DATA_ROWS):
+                 sql_statment = (
+                     f"INSERT INTO cars (brand, price) "
+                     f"VALUES ('{faker.word()}', {faker.random_int()})"
+                 )
+                 cursor.execute(sql_statment)
+
+             sqlite_connection.commit()
+             cursor.close()
+         yield db_path
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "sql")
+ async def test_sqlite_source():
+     with sqlite_download_setup() as db_path:
+         with tempfile.TemporaryDirectory() as tmpdir:
+             connection_config = SQLiteConnectionConfig(database_path=db_path)
+             indexer = SQLiteIndexer(
+                 connection_config=connection_config,
+                 index_config=SQLiteIndexerConfig(
+                     table_name="cars", id_column="car_id", batch_size=5
+                 ),
+             )
+             downloader = SQLiteDownloader(
+                 connection_config=connection_config,
+                 download_config=SQLiteDownloaderConfig(
+                     fields=["car_id", "brand"], download_dir=Path(tmpdir)
+                 ),
+             )
+             await source_connector_validation(
+                 indexer=indexer,
+                 downloader=downloader,
+                 configs=ValidationConfigs(
+                     test_id="sqlite",
+                     expected_num_files=40,
+                 ),
+             )
+
+
+ @contextmanager
+ def sqlite_upload_setup() -> Path:
+     # Provision the local file that sqlite points to to have the desired schema for the integration
+     # tests and make sure the file and connection get cleaned up by using a context manager.
+     with tempfile.TemporaryDirectory() as tmpdir:
+         db_path = Path(tmpdir) / "elements.db"
+         db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
+         assert db_init_path.exists()
+         assert db_init_path.is_file()
+         with sqlite3.connect(database=db_path) as sqlite_connection:
+             with db_init_path.open("r") as f:
+                 query = f.read()
+             cursor = sqlite_connection.cursor()
+             cursor.executescript(query)
+         yield db_path
+
+
+ def validate_destination(db_path: Path, expected_num_elements: int):
+     # Run the following validations:
+     # * Check that the number of records in the table match the expected value
+     connection = None
+     try:
+         connection = sqlite3.connect(database=db_path)
+         query = "select count(*) from elements;"
+         cursor = connection.cursor()
+         cursor.execute(query)
+         count = cursor.fetchone()[0]
+         assert (
+             count == expected_num_elements
+         ), f"dest check failed: got {count}, expected {expected_num_elements}"
+     finally:
+         if connection:
+             connection.close()
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "sql")
+ async def test_sqlite_destination(upload_file: Path):
+     # the sqlite destination connector doesn't leverage the file data but is required as an input,
+     # mocking it with arbitrary values to meet the base requirements:
+     mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
+     with sqlite_upload_setup() as db_path:
+         with tempfile.TemporaryDirectory() as tmpdir:
+             stager = SQLiteUploadStager()
+             stager_params = {
+                 "elements_filepath": upload_file,
+                 "file_data": mock_file_data,
+                 "output_dir": Path(tmpdir),
+                 "output_filename": "test_db",
+             }
+             if stager.is_async():
+                 staged_path = await stager.run_async(**stager_params)
+             else:
+                 staged_path = stager.run(**stager_params)
+
+             # The stager should append the `.json` suffix to the output filename passed in.
+             assert staged_path.name == "test_db.json"
+
+             uploader = SQLiteUploader(
+                 connection_config=SQLiteConnectionConfig(database_path=db_path)
+             )
+             if uploader.is_async():
+                 await uploader.run_async(path=staged_path, file_data=mock_file_data)
+             else:
+                 uploader.run(path=staged_path, file_data=mock_file_data)
+
+             staged_df = pd.read_json(staged_path, orient="records", lines=True)
+             validate_destination(db_path=db_path, expected_num_elements=len(staged_df))