unstructured-ingest: unstructured_ingest-0.7.2-py3-none-any.whl → unstructured_ingest-1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +2 -2
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,14 +0,0 @@
1
- from pathlib import Path
2
-
3
- import pytest
4
-
5
- int_test_dir = Path(__file__).parent
6
- assets_dir = int_test_dir / "assets"
7
-
8
-
9
- @pytest.fixture
10
- def duckdb_schema() -> Path:
11
- schema_file = assets_dir / "duckdb-schema.sql"
12
- assert schema_file.exists()
13
- assert schema_file.is_file()
14
- return schema_file
@@ -1,90 +0,0 @@
1
- import json
2
- from pathlib import Path
3
-
4
- import duckdb
5
- import pytest
6
- from _pytest.fixtures import TopRequest
7
-
8
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
9
- from test.integration.connectors.utils.validation.destination import (
10
- StagerValidationConfigs,
11
- stager_validation,
12
- )
13
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
14
- from unstructured_ingest.processes.connectors.duckdb.duckdb import (
15
- CONNECTOR_TYPE,
16
- DuckDBConnectionConfig,
17
- DuckDBUploader,
18
- DuckDBUploaderConfig,
19
- DuckDBUploadStager,
20
- )
21
-
22
-
23
- @pytest.fixture
24
- def provisioned_db_file(duckdb_schema: Path, temp_dir: Path) -> Path:
25
- db_path = Path(temp_dir) / "temp_duck.db"
26
- with duckdb.connect(database=db_path) as duckdb_connection:
27
- with duckdb_schema.open("r") as f:
28
- query = f.read()
29
- duckdb_connection.execute(query)
30
- duckdb_connection.close()
31
- return db_path
32
-
33
-
34
- def validate_duckdb_destination(db_path: Path, expected_num_elements: int):
35
- conn = None
36
- try:
37
- conn = duckdb.connect(db_path)
38
- _results = conn.sql("select count(*) from elements").fetchall()
39
- _count = _results[0][0]
40
- assert (
41
- _count == expected_num_elements
42
- ), f"dest check failed: got {_count}, expected {expected_num_elements}"
43
- conn.close()
44
- finally:
45
- if conn:
46
- conn.close()
47
-
48
-
49
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
50
- def test_duckdb_destination(upload_file: Path, provisioned_db_file: Path, temp_dir: Path):
51
- file_data = FileData(
52
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
53
- connector_type=CONNECTOR_TYPE,
54
- identifier="mock-file-data",
55
- )
56
-
57
- stager = DuckDBUploadStager()
58
- staged_path = stager.run(
59
- elements_filepath=upload_file,
60
- file_data=file_data,
61
- output_dir=temp_dir,
62
- output_filename=upload_file.name,
63
- )
64
-
65
- connection_config = DuckDBConnectionConfig(database=str(provisioned_db_file))
66
- upload_config = DuckDBUploaderConfig()
67
- uploader = DuckDBUploader(connection_config=connection_config, upload_config=upload_config)
68
-
69
- uploader.run(path=staged_path, file_data=file_data)
70
-
71
- with staged_path.open() as f:
72
- data = json.load(f)
73
- validate_duckdb_destination(db_path=provisioned_db_file, expected_num_elements=len(data))
74
-
75
-
76
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, "duckdb", SQL_TAG)
77
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
78
- def test_duckdb_stager(
79
- request: TopRequest,
80
- upload_file_str: str,
81
- tmp_path: Path,
82
- ):
83
- upload_file: Path = request.getfixturevalue(upload_file_str)
84
- stager = DuckDBUploadStager()
85
- stager_validation(
86
- configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
87
- input_file=upload_file,
88
- stager=stager,
89
- tmp_dir=tmp_path,
90
- )
@@ -1,95 +0,0 @@
1
- import os
2
- import uuid
3
- from pathlib import Path
4
- from typing import Generator
5
-
6
- import duckdb
7
- import pandas as pd
8
- import pytest
9
-
10
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
11
- from test.integration.utils import requires_env
12
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
13
- from unstructured_ingest.processes.connectors.duckdb.motherduck import (
14
- CONNECTOR_TYPE,
15
- MotherDuckAccessConfig,
16
- MotherDuckConnectionConfig,
17
- MotherDuckUploader,
18
- MotherDuckUploaderConfig,
19
- MotherDuckUploadStager,
20
- )
21
-
22
-
23
- @pytest.fixture
24
- def md_token() -> str:
25
- motherduck_token = os.getenv("MOTHERDUCK_TOKEN", None)
26
- assert motherduck_token
27
- return motherduck_token
28
-
29
-
30
- @pytest.fixture
31
- def provisioned_db(md_token: str, duckdb_schema: Path) -> Generator[str, None, None]:
32
- database_name = f"test_{str(uuid.uuid4()).replace('-', '_')}"
33
- try:
34
- with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
35
- with duckdb_schema.open("r") as f:
36
- query = f.read()
37
- md_conn.execute(f"CREATE DATABASE {database_name}")
38
- md_conn.execute(f"USE {database_name}")
39
- md_conn.execute(query)
40
- md_conn.close()
41
- yield database_name
42
- finally:
43
- with duckdb.connect(f"md:?motherduck_token={md_token}") as md_conn:
44
- md_conn.execute(f"DROP DATABASE {database_name}")
45
- md_conn.close()
46
-
47
-
48
- def validate_motherduck_destination(database: str, expected_num_elements: int, md_token: str):
49
- conn = None
50
- try:
51
- conn = duckdb.connect(f"md:?motherduck_token={md_token}")
52
- conn.execute(f"USE {database}")
53
- _results = conn.sql("select count(*) from elements").fetchall()
54
- _count = _results[0][0]
55
- assert (
56
- _count == expected_num_elements
57
- ), f"dest check failed: got {_count}, expected {expected_num_elements}"
58
- conn.close()
59
- finally:
60
- if conn:
61
- conn.close()
62
-
63
-
64
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
65
- @requires_env("MOTHERDUCK_TOKEN")
66
- def test_motherduck_destination(
67
- md_token: str, upload_file: Path, provisioned_db: str, temp_dir: Path
68
- ):
69
- file_data = FileData(
70
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
71
- connector_type=CONNECTOR_TYPE,
72
- identifier="mock-file-data",
73
- )
74
-
75
- stager = MotherDuckUploadStager()
76
- staged_path = stager.run(
77
- elements_filepath=upload_file,
78
- file_data=file_data,
79
- output_dir=temp_dir,
80
- output_filename=upload_file.name,
81
- )
82
-
83
- access_config = MotherDuckAccessConfig(md_token=md_token)
84
- connection_config = MotherDuckConnectionConfig(
85
- database=provisioned_db, access_config=access_config
86
- )
87
- upload_config = MotherDuckUploaderConfig()
88
- uploader = MotherDuckUploader(connection_config=connection_config, upload_config=upload_config)
89
-
90
- uploader.run(path=staged_path, file_data=file_data)
91
-
92
- staged_df = pd.read_json(staged_path, orient="records", lines=True)
93
- validate_motherduck_destination(
94
- database=provisioned_db, expected_num_elements=len(staged_df), md_token=md_token
95
- )
File without changes
@@ -1,34 +0,0 @@
1
- import json
2
- from pathlib import Path
3
-
4
- import pandas as pd
5
- import pytest
6
-
7
- int_test_dir = Path(__file__).parent
8
- assets_dir = int_test_dir / "assets"
9
-
10
-
11
- @pytest.fixture
12
- def movies_dataframe() -> pd.DataFrame:
13
- movies_file = assets_dir / "wiki_movie_plots_small.csv"
14
- assert movies_file.exists()
15
- assert movies_file.is_file()
16
- return pd.read_csv(movies_file).dropna().reset_index()
17
-
18
-
19
- @pytest.fixture
20
- def opensearch_elements_mapping() -> dict:
21
- elements_mapping_file = assets_dir / "opensearch_elements_mappings.json"
22
- assert elements_mapping_file.exists()
23
- assert elements_mapping_file.is_file()
24
- with elements_mapping_file.open() as fp:
25
- return json.load(fp)
26
-
27
-
28
- @pytest.fixture
29
- def elasticsearch_elements_mapping() -> dict:
30
- elements_mapping_file = assets_dir / "elasticsearch_elements_mappings.json"
31
- assert elements_mapping_file.exists()
32
- assert elements_mapping_file.is_file()
33
- with elements_mapping_file.open() as fp:
34
- return json.load(fp)
@@ -1,331 +0,0 @@
1
- # ruff: noqa: I001
2
- import json
3
- import tempfile
4
- import time
5
- from contextlib import contextmanager
6
- from pathlib import Path
7
- from typing import Generator
8
- from test.integration.connectors.utils.validation.destination import (
9
- StagerValidationConfigs,
10
- stager_validation,
11
- )
12
- import pandas as pd
13
- import pytest
14
- from _pytest.fixtures import TopRequest
15
- from elasticsearch import Elasticsearch as ElasticsearchClient
16
- from elasticsearch.helpers import bulk
17
-
18
- from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG, NOSQL_TAG
19
- from test.integration.connectors.utils.docker import HealthCheck, container_context
20
- from test.integration.connectors.utils.validation.source import (
21
- SourceValidationConfigs,
22
- source_connector_validation,
23
- )
24
- from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
25
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
26
- from unstructured_ingest.processes.connectors.elasticsearch.elasticsearch import (
27
- CONNECTOR_TYPE,
28
- ElasticsearchAccessConfig,
29
- ElasticsearchConnectionConfig,
30
- ElasticsearchDownloader,
31
- ElasticsearchDownloaderConfig,
32
- ElasticsearchIndexer,
33
- ElasticsearchIndexerConfig,
34
- ElasticsearchUploader,
35
- ElasticsearchUploaderConfig,
36
- ElasticsearchUploadStager,
37
- ElasticsearchUploadStagerConfig,
38
- )
39
-
40
- SOURCE_INDEX_NAME = "movies"
41
- DESTINATION_INDEX_NAME = "elements"
42
- ES_USERNAME = "elastic"
43
- ES_PASSWORD = "elastic_password"
44
-
45
-
46
- @contextmanager
47
- def get_client() -> Generator[ElasticsearchClient, None, None]:
48
- with ElasticsearchClient(
49
- hosts="http://localhost:9200", basic_auth=(ES_USERNAME, ES_PASSWORD), request_timeout=30
50
- ) as client:
51
- yield client
52
-
53
-
54
- def form_elasticsearch_doc_dict(i, csv_row):
55
- return {
56
- "_index": SOURCE_INDEX_NAME,
57
- "_id": i,
58
- "_source": {
59
- "title": csv_row["Title"],
60
- "ethnicity": csv_row["Origin/Ethnicity"],
61
- "director": csv_row["Director"],
62
- "cast": csv_row["Cast"],
63
- "genre": csv_row["Genre"],
64
- "plot": csv_row["Plot"],
65
- "year": csv_row["Release Year"],
66
- "wiki_page": csv_row["Wiki Page"],
67
- },
68
- }
69
-
70
-
71
- def dataframe_to_upload_data(df: pd.DataFrame) -> list[dict]:
72
- upload_data = []
73
- for index, row in df.iterrows():
74
- upload_data.append(form_elasticsearch_doc_dict(index, row))
75
- return upload_data
76
-
77
-
78
- def get_index_count(client: ElasticsearchClient, index_name: str) -> int:
79
- count_resp = client.cat.count(index=index_name, format="json")
80
- return int(count_resp[0]["count"])
81
-
82
-
83
- def validate_count(
84
- client: ElasticsearchClient,
85
- index_name: str,
86
- expected_count: int,
87
- retries: int = 10,
88
- interval: int = 1,
89
- ) -> None:
90
- current_count = get_index_count(client, index_name)
91
- if current_count == expected_count:
92
- return
93
- tries = 0
94
- while tries < retries:
95
- print(
96
- f"retrying validation to check if expected count "
97
- f"{expected_count} will match current count {current_count}"
98
- )
99
- time.sleep(interval)
100
- current_count = get_index_count(client, index_name)
101
- if current_count == expected_count:
102
- break
103
- assert current_count == expected_count, (
104
- f"Expected count ({expected_count}) doesn't match how "
105
- f"much came back from index: {current_count}"
106
- )
107
-
108
-
109
- def seed_source_db(df: pd.DataFrame):
110
- mapping = {
111
- "properties": {
112
- "title": {"type": "text", "analyzer": "english"},
113
- "ethnicity": {"type": "text", "analyzer": "standard"},
114
- "director": {"type": "text", "analyzer": "standard"},
115
- "cast": {"type": "text", "analyzer": "standard"},
116
- "genre": {"type": "text", "analyzer": "standard"},
117
- "plot": {"type": "text", "analyzer": "english"},
118
- "year": {"type": "integer"},
119
- "wiki_page": {"type": "keyword"},
120
- },
121
- }
122
- # seed content
123
- with get_client() as client:
124
- client.indices.create(index=SOURCE_INDEX_NAME, mappings=mapping)
125
- upload_data = dataframe_to_upload_data(df=df)
126
- bulk(client, upload_data)
127
- client.indices.refresh(index=SOURCE_INDEX_NAME)
128
- count = get_index_count(client, SOURCE_INDEX_NAME)
129
- print(f"seeded {SOURCE_INDEX_NAME} index with {count} records")
130
-
131
-
132
- @pytest.fixture
133
- def source_index(movies_dataframe: pd.DataFrame) -> str:
134
- with container_context(
135
- image="docker.elastic.co/elasticsearch/elasticsearch:8.7.0",
136
- ports={9200: 9200, 9300: 9300},
137
- environment={
138
- "discovery.type": "single-node",
139
- "xpack.security.enabled": True,
140
- "ELASTIC_PASSWORD": ES_PASSWORD,
141
- "ELASTIC_USER": ES_USERNAME,
142
- },
143
- healthcheck=HealthCheck(
144
- test="curl --silent --fail -u ${ELASTIC_USER}:${ELASTIC_PASSWORD} localhost:9200/_cluster/health || exit 1", # noqa: E501
145
- interval=1,
146
- start_period=5,
147
- ),
148
- ):
149
- seed_source_db(df=movies_dataframe)
150
- yield SOURCE_INDEX_NAME
151
-
152
-
153
- @pytest.fixture
154
- def destination_index(elasticsearch_elements_mapping: dict) -> str:
155
- with container_context(
156
- image="docker.elastic.co/elasticsearch/elasticsearch:8.7.0",
157
- ports={9200: 9200, 9300: 9300},
158
- environment={
159
- "discovery.type": "single-node",
160
- "xpack.security.enabled": True,
161
- "ELASTIC_PASSWORD": ES_PASSWORD,
162
- "ELASTIC_USER": ES_USERNAME,
163
- },
164
- healthcheck=HealthCheck(
165
- test="curl --silent --fail -u ${ELASTIC_USER}:${ELASTIC_PASSWORD} localhost:9200/_cluster/health || exit 1", # noqa: E501
166
- interval=1,
167
- start_period=5,
168
- ),
169
- ):
170
- with get_client() as client:
171
- response = client.indices.create(
172
- index=DESTINATION_INDEX_NAME, mappings=elasticsearch_elements_mapping
173
- )
174
- if not response["acknowledged"]:
175
- raise RuntimeError(f"failed to create index: {response}")
176
- yield DESTINATION_INDEX_NAME
177
-
178
-
179
- @pytest.mark.asyncio
180
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
181
- async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.DataFrame):
182
- indexer_config = ElasticsearchIndexerConfig(index_name=source_index)
183
- with tempfile.TemporaryDirectory() as tempdir:
184
- tempdir_path = Path(tempdir)
185
- connection_config = ElasticsearchConnectionConfig(
186
- access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
187
- username=ES_USERNAME,
188
- hosts=["http://localhost:9200"],
189
- )
190
- download_config = ElasticsearchDownloaderConfig(download_dir=tempdir_path)
191
- indexer = ElasticsearchIndexer(
192
- connection_config=connection_config, index_config=indexer_config
193
- )
194
- downloader = ElasticsearchDownloader(
195
- connection_config=connection_config, download_config=download_config
196
- )
197
- expected_num_files = len(movies_dataframe)
198
- await source_connector_validation(
199
- indexer=indexer,
200
- downloader=downloader,
201
- configs=SourceValidationConfigs(
202
- test_id=CONNECTOR_TYPE,
203
- expected_num_files=expected_num_files,
204
- expected_number_indexed_file_data=1,
205
- validate_downloaded_files=True,
206
- ),
207
- )
208
-
209
-
210
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
211
- def test_elasticsearch_source_precheck_fail_no_cluster():
212
- indexer_config = ElasticsearchIndexerConfig(index_name="index")
213
-
214
- connection_config = ElasticsearchConnectionConfig(
215
- access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
216
- username=ES_USERNAME,
217
- hosts=["http://localhost:9200"],
218
- )
219
- indexer = ElasticsearchIndexer(connection_config=connection_config, index_config=indexer_config)
220
- with pytest.raises(SourceConnectionError):
221
- indexer.precheck()
222
-
223
-
224
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, NOSQL_TAG)
225
- def test_elasticsearch_source_precheck_fail_no_index(source_index: str):
226
- indexer_config = ElasticsearchIndexerConfig(index_name="index")
227
-
228
- connection_config = ElasticsearchConnectionConfig(
229
- access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
230
- username=ES_USERNAME,
231
- hosts=["http://localhost:9200"],
232
- )
233
- indexer = ElasticsearchIndexer(connection_config=connection_config, index_config=indexer_config)
234
- with pytest.raises(SourceConnectionError):
235
- indexer.precheck()
236
-
237
-
238
- @pytest.mark.asyncio
239
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
240
- async def test_elasticsearch_destination(
241
- upload_file: Path,
242
- destination_index: str,
243
- tmp_path: Path,
244
- ):
245
- file_data = FileData(
246
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
247
- connector_type=CONNECTOR_TYPE,
248
- identifier="mock file data",
249
- )
250
- connection_config = ElasticsearchConnectionConfig(
251
- access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
252
- username=ES_USERNAME,
253
- hosts=["http://localhost:9200"],
254
- )
255
- stager = ElasticsearchUploadStager(
256
- upload_stager_config=ElasticsearchUploadStagerConfig(index_name=destination_index)
257
- )
258
-
259
- uploader = ElasticsearchUploader(
260
- connection_config=connection_config,
261
- upload_config=ElasticsearchUploaderConfig(index_name=destination_index),
262
- )
263
- staged_filepath = stager.run(
264
- elements_filepath=upload_file,
265
- file_data=file_data,
266
- output_dir=tmp_path,
267
- output_filename=upload_file.name,
268
- )
269
- uploader.precheck()
270
- uploader.run(path=staged_filepath, file_data=file_data)
271
-
272
- # Run validation
273
- with staged_filepath.open() as f:
274
- staged_elements = json.load(f)
275
- expected_count = len(staged_elements)
276
- with get_client() as client:
277
- validate_count(client=client, expected_count=expected_count, index_name=destination_index)
278
-
279
- # Rerun and make sure the same documents get updated
280
- uploader.run(path=staged_filepath, file_data=file_data)
281
- with get_client() as client:
282
- validate_count(client=client, expected_count=expected_count, index_name=destination_index)
283
-
284
-
285
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
286
- def test_elasticsearch_destination_precheck_fail():
287
- connection_config = ElasticsearchConnectionConfig(
288
- access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
289
- username=ES_USERNAME,
290
- hosts=["http://localhost:9200"],
291
- )
292
- uploader = ElasticsearchUploader(
293
- connection_config=connection_config,
294
- upload_config=ElasticsearchUploaderConfig(index_name="index"),
295
- )
296
- with pytest.raises(DestinationConnectionError):
297
- uploader.precheck()
298
-
299
-
300
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
301
- def test_elasticsearch_destination_precheck_fail_no_index(destination_index: str):
302
- connection_config = ElasticsearchConnectionConfig(
303
- access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
304
- username=ES_USERNAME,
305
- hosts=["http://localhost:9200"],
306
- )
307
- uploader = ElasticsearchUploader(
308
- connection_config=connection_config,
309
- upload_config=ElasticsearchUploaderConfig(index_name="index"),
310
- )
311
- with pytest.raises(DestinationConnectionError):
312
- uploader.precheck()
313
-
314
-
315
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
316
- @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
317
- def test_elasticsearch_stager(
318
- request: TopRequest,
319
- upload_file_str: str,
320
- tmp_path: Path,
321
- ):
322
- upload_file: Path = request.getfixturevalue(upload_file_str)
323
- stager = ElasticsearchUploadStager(
324
- upload_stager_config=ElasticsearchUploadStagerConfig(index_name="mock_index")
325
- )
326
- stager_validation(
327
- configs=StagerValidationConfigs(test_id=CONNECTOR_TYPE, expected_count=22),
328
- input_file=upload_file,
329
- stager=stager,
330
- tmp_dir=tmp_path,
331
- )