unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (192)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
test/integration/connectors/sql/test_databricks_delta_tables.py
@@ -1,170 +0,0 @@
- import json
- import os
- import time
- from contextlib import contextmanager
- from pathlib import Path
- from uuid import uuid4
-
- import pytest
- from databricks.sql import connect
- from databricks.sql.client import Connection as DeltaTableConnection
- from databricks.sql.client import Cursor as DeltaTableCursor
- from pydantic import BaseModel, Secret, SecretStr
- from pytest_mock import MockerFixture
-
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG, env_setup_path
- from test.integration.utils import requires_env
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.logger import logger
- from unstructured_ingest.processes.connectors.sql.databricks_delta_tables import (
-     CONNECTOR_TYPE,
-     DatabricksDeltaTablesAccessConfig,
-     DatabricksDeltaTablesConnectionConfig,
-     DatabricksDeltaTablesUploader,
-     DatabricksDeltaTablesUploaderConfig,
-     DatabricksDeltaTablesUploadStager,
- )
-
- CATALOG = "utic-dev-tech-fixtures"
-
-
- class EnvData(BaseModel):
-     server_hostname: str
-     http_path: str
-     access_token: SecretStr
-
-
- def get_env_data() -> EnvData:
-     return EnvData(
-         server_hostname=os.environ["DATABRICKS_SERVER_HOSTNAME"],
-         http_path=os.environ["DATABRICKS_HTTP_PATH"],
-         access_token=os.environ["DATABRICKS_ACCESS_TOKEN"],
-     )
-
-
- def get_destination_schema(new_table_name: str) -> str:
-     p = Path(env_setup_path / "sql" / "databricks_delta_tables" / "destination" / "schema.sql")
-     with p.open() as f:
-         data_lines = f.readlines()
-     data_lines[0] = data_lines[0].replace("elements", new_table_name)
-     data = "".join([line.strip() for line in data_lines])
-     return data
-
-
- @contextmanager
- def get_connection() -> DeltaTableConnection:
-     env_data = get_env_data()
-     with connect(
-         server_hostname=env_data.server_hostname,
-         http_path=env_data.http_path,
-         access_token=env_data.access_token.get_secret_value(),
-     ) as connection:
-         yield connection
-
-
- @contextmanager
- def get_cursor() -> DeltaTableCursor:
-     with get_connection() as connection:
-         with connection.cursor() as cursor:
-             cursor.execute(f"USE CATALOG '{CATALOG}'")
-             yield cursor
-
-
- @pytest.fixture
- def destination_table() -> str:
-     random_id = str(uuid4())[:8]
-     table_name = f"elements_{random_id}"
-     destination_schema = get_destination_schema(new_table_name=table_name)
-     with get_cursor() as cursor:
-         logger.info(f"creating table: {table_name}")
-         cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
-         cursor.execute(destination_schema)
-
-     yield table_name
-     with get_cursor() as cursor:
-         logger.info(f"dropping table: {table_name}")
-         cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
-
-
- def validate_destination(expected_num_elements: int, table_name: str, retries=30, interval=1):
-     with get_cursor() as cursor:
-         for i in range(retries):
-             cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
-             count = cursor.fetchone()[0]
-             if count == expected_num_elements:
-                 break
-             logger.info(f"retry attempt {i}: expected {expected_num_elements} != count {count}")
-             time.sleep(interval)
-         assert (
-             count == expected_num_elements
-         ), f"dest check failed: got {count}, expected {expected_num_elements}"
-
-
- @pytest.mark.asyncio
- @pytest.mark.skip("Resources take too long to spin up to run in CI")
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- @requires_env("DATABRICKS_SERVER_HOSTNAME", "DATABRICKS_HTTP_PATH", "DATABRICKS_ACCESS_TOKEN")
- async def test_databricks_delta_tables_destination(
-     upload_file: Path, temp_dir: Path, destination_table: str
- ):
-     env_data = get_env_data()
-     mock_file_data = FileData(
-         identifier="mock file data",
-         connector_type=CONNECTOR_TYPE,
-         source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
-     )
-     stager = DatabricksDeltaTablesUploadStager()
-     staged_path = stager.run(
-         elements_filepath=upload_file,
-         file_data=mock_file_data,
-         output_dir=temp_dir,
-         output_filename=upload_file.name,
-     )
-
-     assert staged_path.suffix == upload_file.suffix
-
-     uploader = DatabricksDeltaTablesUploader(
-         connection_config=DatabricksDeltaTablesConnectionConfig(
-             access_config=DatabricksDeltaTablesAccessConfig(
-                 token=env_data.access_token.get_secret_value()
-             ),
-             http_path=env_data.http_path,
-             server_hostname=env_data.server_hostname,
-         ),
-         upload_config=DatabricksDeltaTablesUploaderConfig(
-             catalog=CATALOG, database="default", table_name=destination_table
-         ),
-     )
-     with staged_path.open("r") as f:
-         staged_data = json.load(f)
-     expected_num_elements = len(staged_data)
-     uploader.precheck()
-     uploader.run(path=staged_path, file_data=mock_file_data)
-     validate_destination(expected_num_elements=expected_num_elements, table_name=destination_table)
-
-
- def test_get_credentials_provider_with_client_id_and_secret(mocker: MockerFixture):
-     access_config = DatabricksDeltaTablesAccessConfig(
-         client_id="test_client_id", client_secret="test_client_secret"
-     )
-     connection_config = DatabricksDeltaTablesConnectionConfig(
-         access_config=Secret(access_config),
-         server_hostname="test_server_hostname",
-         http_path="test_http_path",
-     )
-
-     credentials_provider = connection_config.get_credentials_provider()
-     assert credentials_provider is not False
-     assert type(credentials_provider).__name__ == "function"
-
-
- def test_get_credentials_provider_with_token(mocker: MockerFixture):
-     access_config = DatabricksDeltaTablesAccessConfig(token="test_token")
-     connection_config = DatabricksDeltaTablesConnectionConfig(
-         access_config=Secret(access_config),
-         server_hostname="test_server_hostname",
-         http_path="test_http_path",
-     )
-
-     credentials_provider = connection_config.get_credentials_provider()
-     assert credentials_provider is False
test/integration/connectors/sql/test_postgres.py
@@ -1,201 +0,0 @@
- import json
- from pathlib import Path
-
- import pytest
- from _pytest.fixtures import TopRequest
- from psycopg2 import connect
-
- from test.integration.connectors.utils.constants import (
-     DESTINATION_TAG,
-     SOURCE_TAG,
-     SQL_TAG,
-     env_setup_path,
- )
- from test.integration.connectors.utils.docker_compose import docker_compose_context
- from test.integration.connectors.utils.validation.destination import (
-     StagerValidationConfigs,
-     stager_validation,
- )
- from test.integration.connectors.utils.validation.source import (
-     SourceValidationConfigs,
-     source_connector_validation,
- )
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.processes.connectors.sql.postgres import (
-     CONNECTOR_TYPE,
-     PostgresAccessConfig,
-     PostgresConnectionConfig,
-     PostgresDownloader,
-     PostgresDownloaderConfig,
-     PostgresIndexer,
-     PostgresIndexerConfig,
-     PostgresUploader,
-     PostgresUploadStager,
- )
-
- SEED_DATA_ROWS = 10
-
-
- @pytest.fixture
- def source_database_setup() -> str:
-     db_name = "test_db"
-     with docker_compose_context(docker_compose_path=env_setup_path / "sql" / "postgres" / "source"):
-         connection = connect(
-             user="unstructured",
-             password="test",
-             dbname=db_name,
-             host="localhost",
-             port=5433,
-         )
-         with connection.cursor() as cursor:
-             for i in range(SEED_DATA_ROWS):
-                 sql_statment = f"INSERT INTO cars (brand, price) VALUES " f"('brand_{i}', {i})"
-                 cursor.execute(sql_statment)
-         connection.commit()
-         yield db_name
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
- async def test_postgres_source(temp_dir: Path, source_database_setup: str):
-     connect_params = {
-         "host": "localhost",
-         "port": 5433,
-         "database": "test_db",
-         "user": "unstructured",
-         "password": "test",
-     }
-     connection_config = PostgresConnectionConfig(
-         host=connect_params["host"],
-         port=connect_params["port"],
-         database=connect_params["database"],
-         username=connect_params["user"],
-         access_config=PostgresAccessConfig(password=connect_params["password"]),
-     )
-     indexer = PostgresIndexer(
-         connection_config=connection_config,
-         index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
-     )
-     downloader = PostgresDownloader(
-         connection_config=connection_config,
-         download_config=PostgresDownloaderConfig(fields=["car_id", "brand"], download_dir=temp_dir),
-     )
-     await source_connector_validation(
-         indexer=indexer,
-         downloader=downloader,
-         configs=SourceValidationConfigs(
-             test_id="postgres",
-             expected_num_files=SEED_DATA_ROWS,
-             expected_number_indexed_file_data=2,
-             validate_downloaded_files=True,
-         ),
-     )
-
-
- def validate_destination(
-     connect_params: dict,
-     expected_num_elements: int,
-     test_embedding: list[float],
-     expected_text: str,
- ):
-     # Run the following validations:
-     # * Check that the number of records in the table match the expected value
-     # * Given the embedding, make sure it matches the associated text it belongs to
-     with connect(**connect_params) as connection:
-         cursor = connection.cursor()
-         query = "select count(*) from elements;"
-         cursor.execute(query)
-         count = cursor.fetchone()[0]
-         assert (
-             count == expected_num_elements
-         ), f"dest check failed: got {count}, expected {expected_num_elements}"
-
-         cursor.execute("SELECT embeddings FROM elements order by text limit 1")
-         similarity_query = (
-             f"SELECT text FROM elements ORDER BY embeddings <-> '{test_embedding}' LIMIT 1;"
-         )
-         cursor.execute(similarity_query)
-         res = cursor.fetchone()
-         assert res[0] == expected_text
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- async def test_postgres_destination(upload_file: Path, temp_dir: Path):
-     # the postgres destination connector doesn't leverage the file data but is required as an input,
-     # mocking it with arbitrary values to meet the base requirements:
-     mock_file_data = FileData(
-         identifier="mock file data",
-         connector_type=CONNECTOR_TYPE,
-         source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
-     )
-     with docker_compose_context(
-         docker_compose_path=env_setup_path / "sql" / "postgres" / "destination"
-     ):
-         stager = PostgresUploadStager()
-         staged_path = stager.run(
-             elements_filepath=upload_file,
-             file_data=mock_file_data,
-             output_dir=temp_dir,
-             output_filename=upload_file.name,
-         )
-
-         # The stager should append the `.json` suffix to the output filename passed in.
-         assert staged_path.suffix == upload_file.suffix
-
-         connect_params = {
-             "host": "localhost",
-             "port": 5433,
-             "database": "elements",
-             "user": "unstructured",
-             "password": "test",
-         }
-
-         uploader = PostgresUploader(
-             connection_config=PostgresConnectionConfig(
-                 host=connect_params["host"],
-                 port=connect_params["port"],
-                 database=connect_params["database"],
-                 username=connect_params["user"],
-                 access_config=PostgresAccessConfig(password=connect_params["password"]),
-             )
-         )
-         uploader.precheck()
-         uploader.run(path=staged_path, file_data=mock_file_data)
-
-         with staged_path.open("r") as f:
-             staged_data = json.load(f)
-
-         sample_element = staged_data[0]
-         expected_num_elements = len(staged_data)
-         validate_destination(
-             connect_params=connect_params,
-             expected_num_elements=expected_num_elements,
-             expected_text=sample_element["text"],
-             test_embedding=sample_element["embeddings"],
-         )
-
-         uploader.run(path=staged_path, file_data=mock_file_data)
-         validate_destination(
-             connect_params=connect_params,
-             expected_num_elements=expected_num_elements,
-             expected_text=sample_element["text"],
-             test_embedding=sample_element["embeddings"],
-         )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
- def test_postgres_stager(
-     request: TopRequest,
-     upload_file_str: str,
-     tmp_path: Path,
- ):
-     upload_file: Path = request.getfixturevalue(upload_file_str)
-     stager = PostgresUploadStager()
-     stager_validation(
-         configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-         input_file=upload_file,
-         stager=stager,
-         tmp_dir=tmp_path,
-     )
test/integration/connectors/sql/test_singlestore.py
@@ -1,182 +0,0 @@
- import json
- from pathlib import Path
-
- import pytest
- import singlestoredb as s2
- from _pytest.fixtures import TopRequest
-
- from test.integration.connectors.utils.constants import (
-     DESTINATION_TAG,
-     SOURCE_TAG,
-     SQL_TAG,
-     env_setup_path,
- )
- from test.integration.connectors.utils.docker_compose import docker_compose_context
- from test.integration.connectors.utils.validation.destination import (
-     StagerValidationConfigs,
-     stager_validation,
- )
- from test.integration.connectors.utils.validation.source import (
-     SourceValidationConfigs,
-     source_connector_validation,
- )
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.processes.connectors.sql.singlestore import (
-     CONNECTOR_TYPE,
-     SingleStoreAccessConfig,
-     SingleStoreConnectionConfig,
-     SingleStoreDownloader,
-     SingleStoreDownloaderConfig,
-     SingleStoreIndexer,
-     SingleStoreIndexerConfig,
-     SingleStoreUploader,
-     SingleStoreUploaderConfig,
-     SingleStoreUploadStager,
- )
-
- SEED_DATA_ROWS = 10
-
-
- @pytest.fixture
- def source_database_setup() -> dict:
-     connect_params = {
-         "host": "localhost",
-         "port": 3306,
-         "database": "ingest_test",
-         "user": "root",
-         "password": "password",
-     }
-     with docker_compose_context(
-         docker_compose_path=env_setup_path / "sql" / "singlestore" / "source"
-     ):
-         with s2.connect(**connect_params) as connection:
-             with connection.cursor() as cursor:
-                 for i in range(SEED_DATA_ROWS):
-                     sql_statment = f"INSERT INTO cars (brand, price) VALUES " f"('brand_{i}', {i})"
-                     cursor.execute(sql_statment)
-             connection.commit()
-         yield connect_params
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, SQL_TAG)
- async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
-
-     connection_config = SingleStoreConnectionConfig(
-         host=source_database_setup["host"],
-         port=source_database_setup["port"],
-         database=source_database_setup["database"],
-         user=source_database_setup["user"],
-         access_config=SingleStoreAccessConfig(password=source_database_setup["password"]),
-     )
-     indexer = SingleStoreIndexer(
-         connection_config=connection_config,
-         index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
-     )
-     downloader = SingleStoreDownloader(
-         connection_config=connection_config,
-         download_config=SingleStoreDownloaderConfig(
-             fields=["car_id", "brand"], download_dir=temp_dir
-         ),
-     )
-     await source_connector_validation(
-         indexer=indexer,
-         downloader=downloader,
-         configs=SourceValidationConfigs(
-             test_id="singlestore",
-             expected_num_files=SEED_DATA_ROWS,
-             expected_number_indexed_file_data=2,
-             validate_downloaded_files=True,
-         ),
-     )
-
-
- def validate_destination(
-     connect_params: dict,
-     expected_num_elements: int,
- ):
-     with s2.connect(**connect_params) as connection:
-         with connection.cursor() as cursor:
-             query = "select count(*) from elements;"
-             cursor.execute(query)
-             count = cursor.fetchone()[0]
-             assert (
-                 count == expected_num_elements
-             ), f"dest check failed: got {count}, expected {expected_num_elements}"
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- async def test_singlestore_destination(upload_file: Path, temp_dir: Path):
-     mock_file_data = FileData(
-         identifier="mock file data",
-         connector_type=CONNECTOR_TYPE,
-         source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
-     )
-     with docker_compose_context(
-         docker_compose_path=env_setup_path / "sql" / "singlestore" / "destination"
-     ):
-         stager = SingleStoreUploadStager()
-         staged_path = stager.run(
-             elements_filepath=upload_file,
-             file_data=mock_file_data,
-             output_dir=temp_dir,
-             output_filename=upload_file.name,
-         )
-
-         # The stager should append the `.json` suffix to the output filename passed in.
-         assert staged_path.suffix == upload_file.suffix
-
-         connect_params = {
-             "host": "localhost",
-             "port": 3306,
-             "database": "ingest_test",
-             "user": "root",
-             "password": "password",
-         }
-
-         uploader = SingleStoreUploader(
-             connection_config=SingleStoreConnectionConfig(
-                 host=connect_params["host"],
-                 port=connect_params["port"],
-                 database=connect_params["database"],
-                 user=connect_params["user"],
-                 access_config=SingleStoreAccessConfig(password=connect_params["password"]),
-             ),
-             upload_config=SingleStoreUploaderConfig(
-                 table_name="elements",
-             ),
-         )
-         uploader.precheck()
-         uploader.run(path=staged_path, file_data=mock_file_data)
-
-         with staged_path.open("r") as f:
-             staged_data = json.load(f)
-         expected_num_elements = len(staged_data)
-         validate_destination(
-             connect_params=connect_params,
-             expected_num_elements=expected_num_elements,
-         )
-
-         uploader.run(path=staged_path, file_data=mock_file_data)
-         validate_destination(
-             connect_params=connect_params,
-             expected_num_elements=expected_num_elements,
-         )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
- def test_singlestore_stager(
-     request: TopRequest,
-     upload_file_str: str,
-     tmp_path: Path,
- ):
-     upload_file: Path = request.getfixturevalue(upload_file_str)
-     stager = SingleStoreUploadStager()
-     stager_validation(
-         configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
-         input_file=upload_file,
-         stager=stager,
-         tmp_dir=tmp_path,
-     )