unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_logger.py +0 -78
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,244 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
import snowflake.connector as sf
|
|
7
|
-
from _pytest.fixtures import TopRequest
|
|
8
|
-
|
|
9
|
-
from test.integration.connectors.utils.constants import (
|
|
10
|
-
DESTINATION_TAG,
|
|
11
|
-
SOURCE_TAG,
|
|
12
|
-
SQL_TAG,
|
|
13
|
-
env_setup_path,
|
|
14
|
-
)
|
|
15
|
-
from test.integration.connectors.utils.docker import container_context
|
|
16
|
-
from test.integration.connectors.utils.validation.destination import (
|
|
17
|
-
StagerValidationConfigs,
|
|
18
|
-
stager_validation,
|
|
19
|
-
)
|
|
20
|
-
from test.integration.connectors.utils.validation.source import (
|
|
21
|
-
SourceValidationConfigs,
|
|
22
|
-
source_connector_validation,
|
|
23
|
-
)
|
|
24
|
-
from test.integration.utils import requires_env
|
|
25
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
26
|
-
from unstructured_ingest.processes.connectors.sql.snowflake import (
|
|
27
|
-
CONNECTOR_TYPE,
|
|
28
|
-
SnowflakeAccessConfig,
|
|
29
|
-
SnowflakeConnectionConfig,
|
|
30
|
-
SnowflakeDownloader,
|
|
31
|
-
SnowflakeDownloaderConfig,
|
|
32
|
-
SnowflakeIndexer,
|
|
33
|
-
SnowflakeIndexerConfig,
|
|
34
|
-
SnowflakeUploader,
|
|
35
|
-
SnowflakeUploadStager,
|
|
36
|
-
)
|
|
37
|
-
|
|
38
|
-
SEED_DATA_ROWS = 20
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def seed_data() -> dict:
    """Apply the source schema and insert seed rows into the ``cars`` table.

    Connects with fixed test credentials, executes the source
    ``snowflake-schema.sql`` script and inserts ``SEED_DATA_ROWS`` rows.

    Returns:
        The connection parameters that were used, so callers can reconnect.
    """
    connect_params = {
        "user": "test",
        "password": "test",
        "account": "test",
        "database": "test",
        "host": "snowflake.localhost.localstack.cloud",
    }
    schema_file = Path(env_setup_path / "sql" / "snowflake" / "source" / "snowflake-schema.sql")
    # Read the schema before connecting so a missing file never leaves a connection open.
    with schema_file.open() as f:
        sql = f.read()

    conn = sf.connect(**connect_params)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            for i in range(SEED_DATA_ROWS):
                # Values come from range(), not external input, so inlining them is safe here.
                cur.execute(f"INSERT INTO cars (brand, price) VALUES ('brand_{i}', {i})")
        finally:
            cur.close()
    finally:
        # Always release the connection, even when seeding fails part-way through.
        conn.close()
    return connect_params
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
@pytest.fixture
def source_database_setup() -> dict:
    """Yield seeded connection parameters from inside a ``localstack/snowflake`` container context.

    The ``yield`` happens inside the ``container_context`` block, so that
    context stays open for as long as the requesting test runs.
    """
    container_env = {
        "LOCALSTACK_AUTH_TOKEN": os.getenv("LOCALSTACK_AUTH_TOKEN"),
        "EXTRA_CORS_ALLOWED_ORIGINS": "*",
    }
    with container_context(
        image="localstack/snowflake",
        environment=container_env,
        ports={4566: 4566, 443: 443},
        healthcheck_retries=30,
    ):
        # Seed only once the container context has been entered.
        yield seed_data()
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def init_db_destination() -> dict:
    """Apply the destination schema script using fixed test credentials.

    Returns:
        The connection parameters that were used, so callers can reconnect.
    """
    connect_params = {
        "user": "test",
        "password": "test",
        "account": "test",
        "database": "test",
        "host": "snowflake.localhost.localstack.cloud",
    }
    schema_file = Path(
        env_setup_path / "sql" / "snowflake" / "destination" / "snowflake-schema.sql"
    )
    # Read the schema before connecting so a missing file never leaves a connection open.
    with schema_file.open() as f:
        sql = f.read()

    conn = sf.connect(**connect_params)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
        finally:
            cur.close()
    finally:
        # Always release the connection, even when the schema script fails.
        conn.close()
    return connect_params
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
@pytest.fixture
def destination_database_setup() -> dict:
    """Yield destination connection parameters from inside a ``localstack/snowflake`` container.

    The ``yield`` happens inside the ``container_context`` block, so that
    context stays open for as long as the requesting test runs.
    """
    container_env = {
        "LOCALSTACK_AUTH_TOKEN": os.getenv("LOCALSTACK_AUTH_TOKEN"),
        "EXTRA_CORS_ALLOWED_ORIGINS": "*",
    }
    with container_context(
        image="localstack/snowflake",
        environment=container_env,
        ports={4566: 4566, 443: 443},
        healthcheck_retries=30,
    ):
        # Apply the destination schema only once the container context has been entered.
        yield init_db_destination()
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
@requires_env("LOCALSTACK_AUTH_TOKEN")
async def test_snowflake_source(temp_dir: Path, source_database_setup: dict):
    """Index and download the seeded ``cars`` table and validate the results.

    Args:
        temp_dir: Directory the downloader writes into.
        source_database_setup: Connection parameters yielded by the seeding fixture.
    """
    # Build the config from the fixture's parameters so the test always targets the
    # database that was actually seeded, rather than a second copy of the literals.
    connect_params = source_database_setup
    connection_config = SnowflakeConnectionConfig(
        access_config=SnowflakeAccessConfig(password=connect_params["password"]),
        account=connect_params["account"],
        user=connect_params["user"],
        database=connect_params["database"],
        host=connect_params["host"],
    )
    indexer = SnowflakeIndexer(
        connection_config=connection_config,
        index_config=SnowflakeIndexerConfig(table_name="cars", id_column="CAR_ID", batch_size=5),
    )
    downloader = SnowflakeDownloader(
        connection_config=connection_config,
        download_config=SnowflakeDownloaderConfig(
            fields=["CAR_ID", "BRAND"], download_dir=temp_dir
        ),
    )
    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=SourceValidationConfigs(
            test_id="snowflake",
            expected_num_files=SEED_DATA_ROWS,
            # 20 seeded rows with batch_size=5 -> 4; presumably one indexed entry per batch.
            expected_number_indexed_file_data=4,
            validate_downloaded_files=True,
        ),
    )
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
def validate_destination(
    connect_params: dict,
    expected_num_elements: int,
):
    """Assert that the ``elements`` table holds exactly ``expected_num_elements`` rows.

    Only the row count is checked; the content of the rows is not inspected.

    Args:
        connect_params: Keyword arguments forwarded to ``sf.connect``.
        expected_num_elements: Row count the table is expected to contain.

    Raises:
        AssertionError: If the row count differs from the expected value.
    """
    conn = sf.connect(**connect_params)
    try:
        # Create the cursor inside the guarded region so the connection is
        # still closed if cursor creation itself fails.
        cursor = conn.cursor()
        try:
            cursor.execute("select count(*) from elements;")
            count = cursor.fetchone()[0]
            assert (
                count == expected_num_elements
            ), f"dest check failed: got {count}, expected {expected_num_elements}"
        finally:
            cursor.close()
    finally:
        conn.close()
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
@requires_env("LOCALSTACK_AUTH_TOKEN")
async def test_snowflake_destination(
    upload_file: Path, temp_dir: Path, destination_database_setup: dict
):
    """Stage a file, upload it twice, and check the destination row count after each upload.

    Args:
        upload_file: Elements file to stage and upload.
        temp_dir: Directory the stager writes its output into.
        destination_database_setup: Connection parameters yielded by the destination fixture.
    """
    # File data is a required input for the stager and uploader; it is mocked here
    # with arbitrary values to meet the base requirements.
    mock_file_data = FileData(
        identifier="mock file data",
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
    )
    # NOTE(review): the destination_database_setup fixture already calls
    # init_db_destination(); this second call is kept to preserve existing behavior.
    init_db_destination()
    stager = SnowflakeUploadStager()
    staged_path = stager.run(
        elements_filepath=upload_file,
        file_data=mock_file_data,
        output_dir=temp_dir,
        output_filename=upload_file.name,
    )

    # The staged file must keep the same suffix as the input file.
    assert staged_path.suffix == upload_file.suffix

    # Reuse the parameters the fixture initialised the database with instead of
    # repeating the literals, so the uploader and the validation cannot drift apart.
    connect_params = destination_database_setup

    uploader = SnowflakeUploader(
        connection_config=SnowflakeConnectionConfig(
            access_config=SnowflakeAccessConfig(password=connect_params["password"]),
            account=connect_params["account"],
            user=connect_params["user"],
            database=connect_params["database"],
            host=connect_params["host"],
        )
    )
    uploader.precheck()
    uploader.run(path=staged_path, file_data=mock_file_data)

    with staged_path.open("r") as f:
        staged_data = json.load(f)
    expected_num_elements = len(staged_data)
    validate_destination(
        connect_params=connect_params,
        expected_num_elements=expected_num_elements,
    )

    # Upload the same staged file again: the row count is expected to stay the same.
    uploader.run(path=staged_path, file_data=mock_file_data)
    validate_destination(
        connect_params=connect_params,
        expected_num_elements=expected_num_elements,
    )
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
def test_snowflake_stager(
    request: TopRequest,
    upload_file_str: str,
    tmp_path: Path,
):
    """Run the shared stager validation for each parametrized upload-file fixture."""
    # Look the fixture up by name so one test body covers every parametrized input.
    input_path: Path = request.getfixturevalue(upload_file_str)
    validation_configs = StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22)
    stager_validation(
        configs=validation_configs,
        input_file=input_path,
        stager=SnowflakeUploadStager(),
        tmp_dir=tmp_path,
    )
|
|
@@ -1,168 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import sqlite3
|
|
3
|
-
import tempfile
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
import pytest
|
|
7
|
-
from _pytest.fixtures import TopRequest
|
|
8
|
-
|
|
9
|
-
from test.integration.connectors.utils.constants import (
|
|
10
|
-
DESTINATION_TAG,
|
|
11
|
-
SOURCE_TAG,
|
|
12
|
-
SQL_TAG,
|
|
13
|
-
env_setup_path,
|
|
14
|
-
)
|
|
15
|
-
from test.integration.connectors.utils.validation.destination import (
|
|
16
|
-
StagerValidationConfigs,
|
|
17
|
-
stager_validation,
|
|
18
|
-
)
|
|
19
|
-
from test.integration.connectors.utils.validation.source import (
|
|
20
|
-
SourceValidationConfigs,
|
|
21
|
-
source_connector_validation,
|
|
22
|
-
)
|
|
23
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
24
|
-
from unstructured_ingest.processes.connectors.sql.sqlite import (
|
|
25
|
-
CONNECTOR_TYPE,
|
|
26
|
-
SQLiteConnectionConfig,
|
|
27
|
-
SQLiteDownloader,
|
|
28
|
-
SQLiteDownloaderConfig,
|
|
29
|
-
SQLiteIndexer,
|
|
30
|
-
SQLiteIndexerConfig,
|
|
31
|
-
SQLiteUploader,
|
|
32
|
-
SQLiteUploadStager,
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
SEED_DATA_ROWS = 10
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
@pytest.fixture
def source_database_setup() -> Path:
    """Create a temporary SQLite database seeded with ``SEED_DATA_ROWS`` rows in ``cars``.

    Yields:
        Path to the database file. The file lives in a temporary directory that is
        removed once the requesting test finishes.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = Path(tmpdir) / "mock_database.db"
        db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
        assert db_init_path.exists()
        assert db_init_path.is_file()
        with db_init_path.open("r") as f:
            schema_sql = f.read()

        connection = sqlite3.connect(database=db_path)
        try:
            cursor = connection.cursor()
            cursor.executescript(schema_sql)
            # Bind the values instead of formatting them into the SQL text.
            cursor.executemany(
                "INSERT INTO cars (brand, price) VALUES (?, ?)",
                [(f"brand{i}", i) for i in range(SEED_DATA_ROWS)],
            )
            connection.commit()
            cursor.close()
        finally:
            # sqlite3's connection context manager only commits or rolls back; it does
            # not close the connection, so close it explicitly before handing the file
            # to the test and before the temporary directory is removed.
            connection.close()
        yield db_path
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
    """Index and download the seeded ``cars`` table and validate the results."""
    connection_config = SQLiteConnectionConfig(database_path=source_database_setup)

    index_config = SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6)
    indexer = SQLiteIndexer(connection_config=connection_config, index_config=index_config)

    download_config = SQLiteDownloaderConfig(fields=["car_id", "brand"], download_dir=temp_dir)
    downloader = SQLiteDownloader(
        connection_config=connection_config, download_config=download_config
    )

    validation_configs = SourceValidationConfigs(
        test_id="sqlite",
        expected_num_files=SEED_DATA_ROWS,
        expected_number_indexed_file_data=2,
        validate_downloaded_files=True,
    )
    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=validation_configs,
    )
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
@pytest.fixture
def destination_database_setup(temp_dir: Path) -> Path:
    """Create ``elements.db`` under ``temp_dir`` with the destination schema applied.

    Args:
        temp_dir: Directory in which the database file is created.

    Returns:
        Path to the provisioned database file.
    """
    db_path = temp_dir / "elements.db"
    db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
    assert db_init_path.exists()
    assert db_init_path.is_file()
    with db_init_path.open("r") as f:
        schema_sql = f.read()

    connection = sqlite3.connect(database=db_path)
    try:
        # `with connection` commits on success and rolls back on error, but it does
        # not close the connection; the explicit close below does.
        with connection:
            cursor = connection.cursor()
            cursor.executescript(schema_sql)
    finally:
        connection.close()
    return db_path
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def validate_destination(db_path: Path, expected_num_elements: int):
    """Assert that the ``elements`` table in ``db_path`` has ``expected_num_elements`` rows.

    Args:
        db_path: SQLite database file to inspect.
        expected_num_elements: Row count the table is expected to contain.

    Raises:
        AssertionError: If the row count differs from the expected value.
    """
    # If connecting fails there is nothing to close, so only the work done
    # after a successful connect needs the cleanup guard.
    connection = sqlite3.connect(database=db_path)
    try:
        row = connection.cursor().execute("select count(*) from elements;").fetchone()
        count = row[0]
        assert (
            count == expected_num_elements
        ), f"dest check failed: got {count}, expected {expected_num_elements}"
    finally:
        connection.close()
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
async def test_sqlite_destination(
    upload_file: Path, temp_dir: Path, destination_database_setup: Path
):
    """Stage a file, upload it twice, and check the destination row count after each upload."""
    file_name = upload_file.name
    # File data is a required input; it is mocked with arbitrary values to meet
    # the base requirements.
    mock_file_data = FileData(
        identifier="mock file data",
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(filename=file_name, fullpath=file_name),
    )
    staged_path = SQLiteUploadStager().run(
        elements_filepath=upload_file,
        file_data=mock_file_data,
        output_dir=temp_dir,
        output_filename=file_name,
    )

    # The staged file must keep the same suffix as the input file.
    assert staged_path.suffix == upload_file.suffix

    connection_config = SQLiteConnectionConfig(database_path=destination_database_setup)
    uploader = SQLiteUploader(connection_config=connection_config)
    uploader.precheck()
    uploader.run(path=staged_path, file_data=mock_file_data)

    with staged_path.open("r") as f:
        expected_num_elements = len(json.load(f))
    validate_destination(
        db_path=destination_database_setup, expected_num_elements=expected_num_elements
    )

    # Upload the same staged file again: the row count is expected to stay the same.
    uploader.run(path=staged_path, file_data=mock_file_data)
    validate_destination(
        db_path=destination_database_setup, expected_num_elements=expected_num_elements
    )
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
def test_sqlite_stager(
    request: TopRequest,
    upload_file_str: str,
    tmp_path: Path,
):
    """Run the shared stager validation for each parametrized upload-file fixture."""
    # Look the fixture up by name so one test body covers every parametrized input.
    input_path: Path = request.getfixturevalue(upload_file_str)
    validation_configs = StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22)
    stager_validation(
        configs=validation_configs,
        input_file=input_path,
        stager=SQLiteUploadStager(),
        tmp_dir=tmp_path,
    )
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
from _pytest.fixtures import TopRequest
|
|
5
|
-
|
|
6
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
|
|
7
|
-
from test.integration.connectors.utils.validation.destination import (
|
|
8
|
-
StagerValidationConfigs,
|
|
9
|
-
stager_validation,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.processes.connectors.sql.vastdb import (
|
|
12
|
-
CONNECTOR_TYPE,
|
|
13
|
-
VastdbUploadStager,
|
|
14
|
-
VastdbUploadStagerConfig,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
def test_vast_stager(
    request: TopRequest,
    upload_file_str: str,
    tmp_path: Path,
):
    """Run the shared stager validation with a ``page_number`` -> ``page`` column rename map."""
    # Look the fixture up by name so one test body covers every parametrized input.
    input_path: Path = request.getfixturevalue(upload_file_str)
    stager_config = VastdbUploadStagerConfig(rename_columns_map={"page_number": "page"})
    vast_stager = VastdbUploadStager(upload_stager_config=stager_config)
    validation_configs = StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22)
    stager_validation(
        configs=validation_configs,
        input_file=input_path,
        stager=vast_stager,
        tmp_dir=tmp_path,
    )
|