unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +2 -2
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
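
The per-file diff bodies reproduced below cover only the four deleted SQL connector test modules (items 95-98 above); the remaining changes are summarized in the list only.

This is also a major-version jump (0.7.2 → 1.0.1) that stops shipping the examples/ and test/ packages inside the wheel, as the removed top_level.txt and the much shorter RECORD suggest. Below is a minimal, standard-library-only sketch for checking which layout is installed before relying on either; only the distribution name comes from this diff, the version gate itself is illustrative.

from importlib.metadata import PackageNotFoundError, version


def installed_unstructured_ingest() -> str | None:
    # Return the installed unstructured-ingest version, or None when it is not installed.
    try:
        return version("unstructured-ingest")
    except PackageNotFoundError:
        return None


current = installed_unstructured_ingest()
if current is None:
    print("unstructured-ingest is not installed")
elif int(current.split(".")[0]) >= 1:
    print(f"{current}: 1.x wheel; examples/ and test/ are no longer bundled")
else:
    print(f"{current}: pre-1.0 wheel; examples/ and test/ are still bundled")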
test/integration/connectors/sql/test_singlestore.py (deleted)
@@ -1,182 +0,0 @@
- import json
- from pathlib import Path
-
- import pytest
- import singlestoredb as s2
- from _pytest.fixtures import TopRequest
-
- from test.integration.connectors.utils.constants import (
-     DESTINATION_TAG,
-     SOURCE_TAG,
-     SQL_TAG,
-     env_setup_path,
- )
- from test.integration.connectors.utils.docker_compose import docker_compose_context
- from test.integration.connectors.utils.validation.destination import (
-     StagerValidationConfigs,
-     stager_validation,
- )
- from test.integration.connectors.utils.validation.source import (
-     SourceValidationConfigs,
-     source_connector_validation,
- )
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.processes.connectors.sql.singlestore import (
-     CONNECTOR_TYPE,
-     SingleStoreAccessConfig,
-     SingleStoreConnectionConfig,
-     SingleStoreDownloader,
-     SingleStoreDownloaderConfig,
-     SingleStoreIndexer,
-     SingleStoreIndexerConfig,
-     SingleStoreUploader,
-     SingleStoreUploaderConfig,
-     SingleStoreUploadStager,
- )
-
- SEED_DATA_ROWS = 10
-
-
- @pytest.fixture
- def source_database_setup() -> dict:
-     connect_params = {
-         "host": "localhost",
-         "port": 3306,
-         "database": "ingest_test",
-         "user": "root",
-         "password": "password",
-     }
-     with docker_compose_context(
-         docker_compose_path=env_setup_path / "sql" / "singlestore" / "source"
-     ):
-         with s2.connect(**connect_params) as connection:
-             with connection.cursor() as cursor:
-                 for i in range(SEED_DATA_ROWS):
-                     sql_statment = f"INSERT INTO cars (brand, price) VALUES " f"('brand_{i}', {i})"
-                     cursor.execute(sql_statment)
-                 connection.commit()
-         yield connect_params
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
- async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
-
-     connection_config = SingleStoreConnectionConfig(
-         host=source_database_setup["host"],
-         port=source_database_setup["port"],
-         database=source_database_setup["database"],
-         user=source_database_setup["user"],
-         access_config=SingleStoreAccessConfig(password=source_database_setup["password"]),
-     )
-     indexer = SingleStoreIndexer(
-         connection_config=connection_config,
-         index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
-     )
-     downloader = SingleStoreDownloader(
-         connection_config=connection_config,
-         download_config=SingleStoreDownloaderConfig(
-             fields=["car_id", "brand"], download_dir=temp_dir
-         ),
-     )
-     await source_connector_validation(
-         indexer=indexer,
-         downloader=downloader,
-         configs=SourceValidationConfigs(
-             test_id="singlestore",
-             expected_num_files=SEED_DATA_ROWS,
-             expected_number_indexed_file_data=2,
-             validate_downloaded_files=True,
-         ),
-     )
-
-
- def validate_destination(
-     connect_params: dict,
-     expected_num_elements: int,
- ):
-     with s2.connect(**connect_params) as connection:
-         with connection.cursor() as cursor:
-             query = "select count(*) from elements;"
-             cursor.execute(query)
-             count = cursor.fetchone()[0]
-             assert (
-                 count == expected_num_elements
-             ), f"dest check failed: got {count}, expected {expected_num_elements}"
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
-     mock_file_data = FileData(
-         identifier="mock file data",
-         connector_type=CONNECTOR_TYPE,
-         source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
-     )
-     with docker_compose_context(
-         docker_compose_path=env_setup_path / "sql" / "singlestore" / "destination"
-     ):
-         stager = SingleStoreUploadStager()
-         staged_path = stager.run(
-             elements_filepath=upload_file,
-             file_data=mock_file_data,
-             output_dir=temp_dir,
-             output_filename=upload_file.name,
-         )
-
-         # The stager should append the `.json` suffix to the output filename passed in.
-         assert staged_path.suffix == upload_file.suffix
-
-         connect_params = {
-             "host": "localhost",
-             "port": 3306,
-             "database": "ingest_test",
-             "user": "root",
-             "password": "password",
-         }
-
-         uploader = SingleStoreUploader(
-             connection_config=SingleStoreConnectionConfig(
-                 host=connect_params["host"],
-                 port=connect_params["port"],
-                 database=connect_params["database"],
-                 user=connect_params["user"],
-                 access_config=SingleStoreAccessConfig(password=connect_params["password"]),
-             ),
-             upload_config=SingleStoreUploaderConfig(
-                 table_name="elements",
-             ),
-         )
-         uploader.precheck()
-         uploader.run(path=staged_path, file_data=mock_file_data)
-
-         with staged_path.open("r") as f:
-             staged_data = json.load(f)
-         expected_num_elements = len(staged_data)
-         validate_destination(
-             connect_params=connect_params,
-             expected_num_elements=expected_num_elements,
-         )
-
-         uploader.run(path=staged_path, file_data=mock_file_data)
-         validate_destination(
-             connect_params=connect_params,
-             expected_num_elements=expected_num_elements,
-         )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
- def test_singlestore_stager(
-     request: TopRequest,
-     upload_file_str: str,
-     tmp_path: Path,
- ):
-     upload_file: Path = request.getfixturevalue(upload_file_str)
-     stager = SingleStoreUploadStager()
-     stager_validation(
-         configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-         input_file=upload_file,
-         stager=stager,
-         tmp_dir=tmp_path,
-     )
test/integration/connectors/sql/test_snowflake.py (deleted)
@@ -1,244 +0,0 @@
- import json
- import os
- from pathlib import Path
-
- import pytest
- import snowflake.connector as sf
- from _pytest.fixtures import TopRequest
-
- from test.integration.connectors.utils.constants import (
-     DESTINATION_TAG,
-     SOURCE_TAG,
-     SQL_TAG,
-     env_setup_path,
- )
- from test.integration.connectors.utils.docker import container_context
- from test.integration.connectors.utils.validation.destination import (
-     StagerValidationConfigs,
-     stager_validation,
- )
- from test.integration.connectors.utils.validation.source import (
-     SourceValidationConfigs,
-     source_connector_validation,
- )
- from test.integration.utils import requires_env
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.processes.connectors.sql.snowflake import (
-     CONNECTOR_TYPE,
-     SnowflakeAccessConfig,
-     SnowflakeConnectionConfig,
-     SnowflakeDownloader,
-     SnowflakeDownloaderConfig,
-     SnowflakeIndexer,
-     SnowflakeIndexerConfig,
-     SnowflakeUploader,
-     SnowflakeUploadStager,
- )
-
- SEED_DATA_ROWS = 20
-
-
- def seed_data() -> dict:
-     connect_params = {
-         "user": "test",
-         "password": "test",
-         "account": "test",
-         "database": "test",
-         "host": "snowflake.localhost.localstack.cloud",
-     }
-     conn = sf.connect(**connect_params)
-
-     file = Path(env_setup_path / "sql" / "snowflake" / "source" / "snowflake-schema.sql")
-
-     with file.open() as f:
-         sql = f.read()
-
-     cur = conn.cursor()
-     cur.execute(sql)
-     for i in range(SEED_DATA_ROWS):
-         sql_statment = f"INSERT INTO cars (brand, price) VALUES " f"('brand_{i}', {i})"
-         cur.execute(sql_statment)
-
-     cur.close()
-     conn.close()
-     return connect_params
-
-
- @pytest.fixture
- def source_database_setup() -> dict:
-     token = os.getenv("LOCALSTACK_AUTH_TOKEN")
-     with container_context(
-         image="localstack/snowflake",
-         environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
-         ports={4566: 4566, 443: 443},
-         healthcheck_retries=30,
-     ):
-         connect_params = seed_data()
-         yield connect_params
-
-
- def init_db_destination() -> dict:
-     connect_params = {
-         "user": "test",
-         "password": "test",
-         "account": "test",
-         "database": "test",
-         "host": "snowflake.localhost.localstack.cloud",
-     }
-     conn = sf.connect(**connect_params)
-
-     file = Path(env_setup_path / "sql" / "snowflake" / "destination" / "snowflake-schema.sql")
-
-     with file.open() as f:
-         sql = f.read()
-
-     cur = conn.cursor()
-     cur.execute(sql)
-
-     cur.close()
-     conn.close()
-     return connect_params
-
-
- @pytest.fixture
- def destination_database_setup() -> dict:
-     token = os.getenv("LOCALSTACK_AUTH_TOKEN")
-     with container_context(
-         image="localstack/snowflake",
-         environment={"LOCALSTACK_AUTH_TOKEN": token, "EXTRA_CORS_ALLOWED_ORIGINS": "*"},
-         ports={4566: 4566, 443: 443},
-         healthcheck_retries=30,
-     ):
-         connect_params = init_db_destination()
-         yield connect_params
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
- @requires_env("LOCALSTACK_AUTH_TOKEN")
- async def test_snowflake_source(temp_dir: Path, source_database_setup: dict):
-     connection_config = SnowflakeConnectionConfig(
-         access_config=SnowflakeAccessConfig(password="test"),
-         account="test",
-         user="test",
-         database="test",
-         host="snowflake.localhost.localstack.cloud",
-     )
-     indexer = SnowflakeIndexer(
-         connection_config=connection_config,
-         index_config=SnowflakeIndexerConfig(table_name="cars", id_column="CAR_ID", batch_size=5),
-     )
-     downloader = SnowflakeDownloader(
-         connection_config=connection_config,
-         download_config=SnowflakeDownloaderConfig(
-             fields=["CAR_ID", "BRAND"], download_dir=temp_dir
-         ),
-     )
-     await source_connector_validation(
-         indexer=indexer,
-         downloader=downloader,
-         configs=SourceValidationConfigs(
-             test_id="snowflake",
-             expected_num_files=SEED_DATA_ROWS,
-             expected_number_indexed_file_data=4,
-             validate_downloaded_files=True,
-         ),
-     )
-
-
- def validate_destination(
-     connect_params: dict,
-     expected_num_elements: int,
- ):
-     # Run the following validations:
-     # * Check that the number of records in the table match the expected value
-     # * Given the embedding, make sure it matches the associated text it belongs to
-     conn = sf.connect(**connect_params)
-     cursor = conn.cursor()
-     try:
-         query = "select count(*) from elements;"
-         cursor.execute(query)
-         count = cursor.fetchone()[0]
-         assert (
-             count == expected_num_elements
-         ), f"dest check failed: got {count}, expected {expected_num_elements}"
-     finally:
-         cursor.close()
-         conn.close()
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- @requires_env("LOCALSTACK_AUTH_TOKEN")
- async def test_snowflake_destination(
-     upload_file: Path, temp_dir: Path, destination_database_setup: dict
- ):
-     # the postgres destination connector doesn't leverage the file data but is required as an input,
-     # mocking it with arbitrary values to meet the base requirements:
-     mock_file_data = FileData(
-         identifier="mock file data",
-         connector_type=CONNECTOR_TYPE,
-         source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
-     )
-     init_db_destination()
-     stager = SnowflakeUploadStager()
-     staged_path = stager.run(
-         elements_filepath=upload_file,
-         file_data=mock_file_data,
-         output_dir=temp_dir,
-         output_filename=upload_file.name,
-     )
-
-     # The stager should append the `.json` suffix to the output filename passed in.
-     assert staged_path.suffix == upload_file.suffix
-
-     connect_params = {
-         "user": "test",
-         "password": "test",
-         "account": "test",
-         "database": "test",
-         "host": "snowflake.localhost.localstack.cloud",
-     }
-
-     uploader = SnowflakeUploader(
-         connection_config=SnowflakeConnectionConfig(
-             access_config=SnowflakeAccessConfig(password=connect_params["password"]),
-             account=connect_params["account"],
-             user=connect_params["user"],
-             database=connect_params["database"],
-             host=connect_params["host"],
-         )
-     )
-     uploader.precheck()
-     uploader.run(path=staged_path, file_data=mock_file_data)
-
-     with staged_path.open("r") as f:
-         staged_data = json.load(f)
-     expected_num_elements = len(staged_data)
-     validate_destination(
-         connect_params=connect_params,
-         expected_num_elements=expected_num_elements,
-     )
-
-     uploader.run(path=staged_path, file_data=mock_file_data)
-     validate_destination(
-         connect_params=connect_params,
-         expected_num_elements=expected_num_elements,
-     )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
- def test_snowflake_stager(
-     request: TopRequest,
-     upload_file_str: str,
-     tmp_path: Path,
- ):
-     upload_file: Path = request.getfixturevalue(upload_file_str)
-     stager = SnowflakeUploadStager()
-     stager_validation(
-         configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-         input_file=upload_file,
-         stager=stager,
-         tmp_dir=tmp_path,
-     )
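
The two Snowflake tests above are gated on LOCALSTACK_AUTH_TOKEN via requires_env, a helper imported from test/integration/utils.py (also deleted in this release, item 148 above). Its implementation is not shown in this diff; a plain-pytest guard with the same behavior might look like the sketch below, where only the helper name and the environment variable come from the code above and the body is an assumption.

import os

import pytest


def requires_env(*names: str):
    # Sketch only: skip the decorated test unless every named environment variable is set.
    missing = [name for name in names if not os.getenv(name)]
    return pytest.mark.skipif(
        bool(missing),
        reason=f"missing environment variables: {', '.join(missing)}",
    )


@requires_env("LOCALSTACK_AUTH_TOKEN")
def test_localstack_token_present():
    assert os.getenv("LOCALSTACK_AUTH_TOKEN")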
test/integration/connectors/sql/test_sqlite.py (deleted)
@@ -1,168 +0,0 @@
- import json
- import sqlite3
- import tempfile
- from pathlib import Path
-
- import pytest
- from _pytest.fixtures import TopRequest
-
- from test.integration.connectors.utils.constants import (
-     DESTINATION_TAG,
-     SOURCE_TAG,
-     SQL_TAG,
-     env_setup_path,
- )
- from test.integration.connectors.utils.validation.destination import (
-     StagerValidationConfigs,
-     stager_validation,
- )
- from test.integration.connectors.utils.validation.source import (
-     SourceValidationConfigs,
-     source_connector_validation,
- )
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.processes.connectors.sql.sqlite import (
-     CONNECTOR_TYPE,
-     SQLiteConnectionConfig,
-     SQLiteDownloader,
-     SQLiteDownloaderConfig,
-     SQLiteIndexer,
-     SQLiteIndexerConfig,
-     SQLiteUploader,
-     SQLiteUploadStager,
- )
-
- SEED_DATA_ROWS = 10
-
-
- @pytest.fixture
- def source_database_setup() -> Path:
-     with tempfile.TemporaryDirectory() as tmpdir:
-         db_path = Path(tmpdir) / "mock_database.db"
-         db_init_path = env_setup_path / "sql" / "sqlite" / "source" / "sqlite-schema.sql"
-         assert db_init_path.exists()
-         assert db_init_path.is_file()
-         with sqlite3.connect(database=db_path) as sqlite_connection:
-             cursor = sqlite_connection.cursor()
-             with db_init_path.open("r") as f:
-                 query = f.read()
-             cursor.executescript(query)
-             for i in range(SEED_DATA_ROWS):
-                 sql_statment = f"INSERT INTO cars (brand, price) " f"VALUES ('brand{i}', {i})"
-                 cursor.execute(sql_statment)
-
-             sqlite_connection.commit()
-             cursor.close()
-         yield db_path
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
- async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
-     connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
-     indexer = SQLiteIndexer(
-         connection_config=connection_config,
-         index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
-     )
-     downloader = SQLiteDownloader(
-         connection_config=connection_config,
-         download_config=SQLiteDownloaderConfig(fields=["car_id", "brand"], download_dir=temp_dir),
-     )
-     await source_connector_validation(
-         indexer=indexer,
-         downloader=downloader,
-         configs=SourceValidationConfigs(
-             test_id="sqlite",
-             expected_num_files=SEED_DATA_ROWS,
-             expected_number_indexed_file_data=2,
-             validate_downloaded_files=True,
-         ),
-     )
-
-
- @pytest.fixture
- def destination_database_setup(temp_dir: Path) -> Path:
-     # Provision the local file that sqlite points to to have the desired schema for the integration
-     # tests and make sure the file and connection get cleaned up by using a context manager.
-     db_path = temp_dir / "elements.db"
-     db_init_path = env_setup_path / "sql" / "sqlite" / "destination" / "sqlite-schema.sql"
-     assert db_init_path.exists()
-     assert db_init_path.is_file()
-     with sqlite3.connect(database=db_path) as sqlite_connection:
-         with db_init_path.open("r") as f:
-             query = f.read()
-         cursor = sqlite_connection.cursor()
-         cursor.executescript(query)
-     return db_path
-
-
- def validate_destination(db_path: Path, expected_num_elements: int):
-     # Run the following validations:
-     # * Check that the number of records in the table match the expected value
-     connection = None
-     try:
-         connection = sqlite3.connect(database=db_path)
-         query = "select count(*) from elements;"
-         cursor = connection.cursor()
-         cursor.execute(query)
-         count = cursor.fetchone()[0]
-         assert (
-             count == expected_num_elements
-         ), f"dest check failed: got {count}, expected {expected_num_elements}"
-     finally:
-         if connection:
-             connection.close()
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- async def test_sqlite_destination(
-     upload_file: Path, temp_dir: Path, destination_database_setup: Path
- ):
-     # the sqlite destination connector doesn't leverage the file data but is required as an input,
-     # mocking it with arbitrary values to meet the base requirements:
-     mock_file_data = FileData(
-         identifier="mock file data",
-         connector_type=CONNECTOR_TYPE,
-         source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
-     )
-     stager = SQLiteUploadStager()
-     staged_path = stager.run(
-         elements_filepath=upload_file,
-         file_data=mock_file_data,
-         output_dir=temp_dir,
-         output_filename=upload_file.name,
-     )
-
-     # The stager should append the `.json` suffix to the output filename passed in.
-     assert staged_path.suffix == upload_file.suffix
-
-     uploader = SQLiteUploader(
-         connection_config=SQLiteConnectionConfig(database_path=destination_database_setup)
-     )
-     uploader.precheck()
-     uploader.run(path=staged_path, file_data=mock_file_data)
-
-     with staged_path.open("r") as f:
-         staged_data = json.load(f)
-     validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
-
-     uploader.run(path=staged_path, file_data=mock_file_data)
-     validate_destination(db_path=destination_database_setup, expected_num_elements=len(staged_data))
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
- def test_sqlite_stager(
-     request: TopRequest,
-     upload_file_str: str,
-     tmp_path: Path,
- ):
-     upload_file: Path = request.getfixturevalue(upload_file_str)
-     stager = SQLiteUploadStager()
-     stager_validation(
-         configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-         input_file=upload_file,
-         stager=stager,
-         tmp_dir=tmp_path,
-     )
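
Of the deleted suites, the SQLite destination flow was the only one that needed no external services, so it doubles as the most accessible usage reference for the stager/uploader pair. The sketch below is distilled from test_sqlite_destination above and assumes the imported classes keep the same constructors in 1.0.1; elements.json, elements.db, and the staging directory are placeholders, and the elements table must already exist with the schema the connector expects.

from pathlib import Path

from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
from unstructured_ingest.processes.connectors.sql.sqlite import (
    CONNECTOR_TYPE,
    SQLiteConnectionConfig,
    SQLiteUploader,
    SQLiteUploadStager,
)

elements_file = Path("elements.json")  # partitioned elements, placeholder path
staging_dir = Path("staged")
staging_dir.mkdir(exist_ok=True)

# The uploader does not really use FileData, but it is a required argument,
# so mock it the same way the deleted test did.
file_data = FileData(
    identifier=elements_file.name,
    connector_type=CONNECTOR_TYPE,
    source_identifiers=SourceIdentifiers(filename=elements_file.name, fullpath=elements_file.name),
)

# Stage the raw elements into the shape the SQLite uploader expects.
staged_path = SQLiteUploadStager().run(
    elements_filepath=elements_file,
    file_data=file_data,
    output_dir=staging_dir,
    output_filename=elements_file.name,
)

# Write the staged rows into the pre-provisioned elements table.
uploader = SQLiteUploader(
    connection_config=SQLiteConnectionConfig(database_path=Path("elements.db"))
)
uploader.precheck()  # fails fast if the database file cannot be opened
uploader.run(path=staged_path, file_data=file_data)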
test/integration/connectors/sql/test_vastdb.py (deleted)
@@ -1,34 +0,0 @@
- from pathlib import Path
-
- import pytest
- from _pytest.fixtures import TopRequest
-
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
- from test.integration.connectors.utils.validation.destination import (
-     StagerValidationConfigs,
-     stager_validation,
- )
- from unstructured_ingest.processes.connectors.sql.vastdb import (
-     CONNECTOR_TYPE,
-     VastdbUploadStager,
-     VastdbUploadStagerConfig,
- )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
- def test_vast_stager(
-     request: TopRequest,
-     upload_file_str: str,
-     tmp_path: Path,
- ):
-     upload_file: Path = request.getfixturevalue(upload_file_str)
-     stager = VastdbUploadStager(
-         upload_stager_config=VastdbUploadStagerConfig(rename_columns_map={"page_number": "page"})
-     )
-     stager_validation(
-         configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-         input_file=upload_file,
-         stager=stager,
-         tmp_dir=tmp_path,
-     )