unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.

Potentially problematic release: this version of unstructured-ingest may be problematic.

Files changed (78)
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_confluence.py +113 -0
  4. test/integration/connectors/test_kafka.py +167 -0
  5. test/integration/connectors/test_onedrive.py +112 -0
  6. test/integration/connectors/test_pinecone.py +161 -0
  7. test/integration/connectors/test_qdrant.py +137 -0
  8. test/integration/connectors/test_s3.py +23 -0
  9. test/integration/connectors/utils/docker.py +2 -1
  10. test/integration/connectors/utils/validation.py +73 -22
  11. test/unit/v2/__init__.py +0 -0
  12. test/unit/v2/chunkers/__init__.py +0 -0
  13. test/unit/v2/chunkers/test_chunkers.py +49 -0
  14. test/unit/v2/connectors/__init__.py +0 -0
  15. test/unit/v2/embedders/__init__.py +0 -0
  16. test/unit/v2/embedders/test_bedrock.py +36 -0
  17. test/unit/v2/embedders/test_huggingface.py +48 -0
  18. test/unit/v2/embedders/test_mixedbread.py +37 -0
  19. test/unit/v2/embedders/test_octoai.py +35 -0
  20. test/unit/v2/embedders/test_openai.py +35 -0
  21. test/unit/v2/embedders/test_togetherai.py +37 -0
  22. test/unit/v2/embedders/test_vertexai.py +37 -0
  23. test/unit/v2/embedders/test_voyageai.py +38 -0
  24. test/unit/v2/partitioners/__init__.py +0 -0
  25. test/unit/v2/partitioners/test_partitioner.py +63 -0
  26. test/unit/v2/utils/__init__.py +0 -0
  27. test/unit/v2/utils/data_generator.py +32 -0
  28. unstructured_ingest/__version__.py +1 -1
  29. unstructured_ingest/cli/cmds/__init__.py +2 -2
  30. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  31. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  32. unstructured_ingest/connector/kafka.py +0 -1
  33. unstructured_ingest/interfaces.py +7 -7
  34. unstructured_ingest/runner/writers/__init__.py +2 -2
  35. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  36. unstructured_ingest/v2/constants.py +2 -0
  37. unstructured_ingest/v2/processes/chunker.py +2 -2
  38. unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
  39. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  40. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  41. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  42. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  43. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  44. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  45. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  46. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  47. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
  48. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  49. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  50. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  51. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
  52. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
  53. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  54. unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
  55. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  56. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  57. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  58. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  59. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  60. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  61. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  62. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  63. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  64. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  65. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  66. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  67. unstructured_ingest/v2/processes/partitioner.py +14 -3
  68. unstructured_ingest/v2/unstructured_api.py +24 -10
  69. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
  70. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
  71. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  72. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  73. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  74. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  75. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  76. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  77. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  78. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_onedrive.py
@@ -0,0 +1,112 @@
+ import os
+ import uuid
+ from pathlib import Path
+
+ import pytest
+ from office365.graph_client import GraphClient
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.onedrive import (
+     CONNECTOR_TYPE,
+     OnedriveAccessConfig,
+     OnedriveConnectionConfig,
+     OnedriveUploader,
+     OnedriveUploaderConfig,
+ )
+
+
+ @pytest.fixture
+ def onedrive_test_folder() -> str:
+     """
+     Pytest fixture that creates a test folder in OneDrive and deletes it after test run.
+     """
+     connection_config = get_connection_config()
+     user_pname = connection_config.user_pname
+
+     # Get the OneDrive client
+     client: GraphClient = connection_config.get_client()
+     drive = client.users[user_pname].drive
+
+     # Generate a unique test folder path
+     test_folder_path = f"utic-test-output-{uuid.uuid4()}"
+
+     # Create the test folder
+     root = drive.root
+     folder = root.create_folder(test_folder_path).execute_query()
+     print(f"created folder: {folder.name}")
+     try:
+         yield test_folder_path
+     finally:
+         # Teardown: delete the test folder and its contents
+         folder.delete_object().execute_query()
+         print(f"successfully deleted folder: {folder.name}")
+
+
+ def get_connection_config():
+     """
+     Pytest fixture that provides the OnedriveConnectionConfig for tests.
+     """
+     client_id = os.getenv("MS_CLIENT_ID")
+     client_secret = os.getenv("MS_CLIENT_CRED")
+     tenant_id = os.getenv("MS_TENANT_ID")
+     user_pname = os.getenv("MS_USER_PNAME")
+
+     connection_config = OnedriveConnectionConfig(
+         client_id=client_id,
+         tenant=tenant_id,
+         user_pname=user_pname,
+         access_config=OnedriveAccessConfig(client_cred=client_secret),
+     )
+     return connection_config
+
+
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ @requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
+ def test_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
+     """
+     Integration test for the OneDrive destination connector.
+
+     This test uploads a file to OneDrive and verifies that it exists.
+     """
+     connection_config = get_connection_config()
+     # Retrieve user principal name from the connection config
+     user_pname = connection_config.user_pname
+
+     # The test folder is provided by the fixture
+     destination_folder = onedrive_test_folder
+     destination_fullpath = f"{destination_folder}/{upload_file.name}"
+
+     # Configure the uploader with remote_url
+     upload_config = OnedriveUploaderConfig(remote_url=f"onedrive://{destination_folder}")
+
+     uploader = OnedriveUploader(
+         connection_config=connection_config,
+         upload_config=upload_config,
+     )
+
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(
+             fullpath=destination_fullpath,
+             filename=upload_file.name,
+         ),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock_file_data",
+     )
+     uploader.precheck()
+     uploader.run(path=upload_file, file_data=file_data)
+
+     # Verify that the file was uploaded
+     client = connection_config.get_client()
+     drive = client.users[user_pname].drive
+
+     uploaded_file = (
+         drive.root.get_by_path(destination_fullpath).select(["id", "name"]).get().execute_query()
+     )
+
+     # Check if the file exists
+     assert uploaded_file is not None
+     assert uploaded_file.name == upload_file.name
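For reference, the pieces introduced above compose the same way outside of pytest. Below is a minimal, hypothetical usage sketch built only from names this diff introduces (OnedriveConnectionConfig, OnedriveAccessConfig, OnedriveUploader, OnedriveUploaderConfig); the environment variables mirror the test, while the local file and folder name are placeholders.

    import os
    from pathlib import Path

    from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
    from unstructured_ingest.v2.processes.connectors.onedrive import (
        CONNECTOR_TYPE,
        OnedriveAccessConfig,
        OnedriveConnectionConfig,
        OnedriveUploader,
        OnedriveUploaderConfig,
    )

    # Same environment variables the integration test reads (assumed to be set)
    connection_config = OnedriveConnectionConfig(
        client_id=os.environ["MS_CLIENT_ID"],
        tenant=os.environ["MS_TENANT_ID"],
        user_pname=os.environ["MS_USER_PNAME"],
        access_config=OnedriveAccessConfig(client_cred=os.environ["MS_CLIENT_CRED"]),
    )

    local_file = Path("example.json")  # hypothetical local file to upload
    uploader = OnedriveUploader(
        connection_config=connection_config,
        upload_config=OnedriveUploaderConfig(remote_url="onedrive://some-folder"),
    )
    uploader.precheck()  # fails fast if the connection is misconfigured
    uploader.run(
        path=local_file,
        file_data=FileData(
            source_identifiers=SourceIdentifiers(
                fullpath=f"some-folder/{local_file.name}", filename=local_file.name
            ),
            connector_type=CONNECTOR_TYPE,
            identifier="example-file-data",
        ),
    )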
test/integration/connectors/test_pinecone.py
@@ -0,0 +1,161 @@
+ import json
+ import os
+ import time
+ from pathlib import Path
+ from uuid import uuid4
+
+ import pytest
+ from pinecone import Pinecone, ServerlessSpec
+ from pinecone.core.openapi.shared.exceptions import NotFoundException
+
+ from test.integration.connectors.utils.constants import (
+     DESTINATION_TAG,
+ )
+ from test.integration.utils import requires_env
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connectors.pinecone import (
+     CONNECTOR_TYPE,
+     PineconeAccessConfig,
+     PineconeConnectionConfig,
+     PineconeUploader,
+     PineconeUploaderConfig,
+     PineconeUploadStager,
+     PineconeUploadStagerConfig,
+ )
+
+ API_KEY = "PINECONE_API_KEY"
+
+
+ def get_api_key() -> str:
+     api_key = os.getenv(API_KEY, None)
+     assert api_key
+     return api_key
+
+
+ def wait_for_delete(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
+     start = time.time()
+     while True and time.time() - start < timeout:
+         try:
+             description = client.describe_index(name=index_name)
+             logger.info(f"current index status: {description}")
+         except NotFoundException:
+             return
+         time.sleep(interval)
+
+     raise TimeoutError("time out waiting for index to delete")
+
+
+ def wait_for_ready(client: Pinecone, index_name: str, timeout=60, interval=1) -> None:
+     def is_ready_status():
+         description = client.describe_index(name=index_name)
+         status = description["status"]
+         return status["ready"]
+
+     start = time.time()
+     is_ready = is_ready_status()
+     while not is_ready and time.time() - start < timeout:
+         time.sleep(interval)
+         is_ready = is_ready_status()
+     if not is_ready:
+         raise TimeoutError("time out waiting for index to be ready")
+
+
+ @pytest.fixture
+ def pinecone_index() -> str:
+     pinecone = Pinecone(api_key=get_api_key())
+     random_id = str(uuid4()).split("-")[0]
+     index_name = f"ingest-test-{random_id}"
+     assert len(index_name) < 45
+     logger.info(f"Creating index: {index_name}")
+     try:
+         pinecone.create_index(
+             name=index_name,
+             dimension=384,
+             metric="cosine",
+             spec=ServerlessSpec(
+                 cloud="aws",
+                 region="us-east-1",
+             ),
+             deletion_protection="disabled",
+         )
+         wait_for_ready(client=pinecone, index_name=index_name)
+         yield index_name
+     except Exception as e:
+         logger.error(f"failed to create index {index_name}: {e}")
+     finally:
+         try:
+             logger.info(f"deleting index: {index_name}")
+             pinecone.delete_index(name=index_name)
+             wait_for_delete(client=pinecone, index_name=index_name)
+         except NotFoundException:
+             return
+
+
+ def validate_pinecone_index(
+     index_name: str, expected_num_of_vectors: int, retries=30, interval=1
+ ) -> None:
+     # Because there's a delay for the index to catch up to the recent writes, add in a retry
+     pinecone = Pinecone(api_key=get_api_key())
+     index = pinecone.Index(name=index_name)
+     vector_count = -1
+     for i in range(retries):
+         index_stats = index.describe_index_stats()
+         vector_count = index_stats["total_vector_count"]
+         if vector_count == expected_num_of_vectors:
+             logger.info(f"expected {expected_num_of_vectors} == vector count {vector_count}")
+             break
+         logger.info(
+             f"retry attempt {i}: expected {expected_num_of_vectors} != vector count {vector_count}"
+         )
+         time.sleep(interval)
+     assert vector_count == expected_num_of_vectors
+
+
+ @requires_env(API_KEY)
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="pinecone_mock_id",
+     )
+     connection_config = PineconeConnectionConfig(
+         index_name=pinecone_index,
+         access_config=PineconeAccessConfig(api_key=get_api_key()),
+     )
+     stager_config = PineconeUploadStagerConfig()
+     stager = PineconeUploadStager(upload_stager_config=stager_config)
+     new_upload_file = stager.run(
+         elements_filepath=upload_file,
+         output_dir=temp_dir,
+         output_filename=upload_file.name,
+         file_data=file_data,
+     )
+
+     upload_config = PineconeUploaderConfig()
+     uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
+     uploader.precheck()
+
+     if uploader.is_async():
+         await uploader.run_async(path=new_upload_file, file_data=file_data)
+     else:
+         uploader.run(path=new_upload_file, file_data=file_data)
+     with new_upload_file.open() as f:
+         staged_content = json.load(f)
+         expected_num_of_vectors = len(staged_content)
+     logger.info("validating first upload")
+     validate_pinecone_index(
+         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+     )
+
+     # Rerun uploader and make sure no duplicates exist
+     if uploader.is_async():
+         await uploader.run_async(path=new_upload_file, file_data=file_data)
+     else:
+         uploader.run(path=new_upload_file, file_data=file_data)
+     logger.info("validating second upload")
+     validate_pinecone_index(
+         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+     )
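The wait_for_ready and wait_for_delete helpers above are two instances of the same poll-until-condition-with-timeout idiom. As a generalization sketch (not part of the package), the loop can be factored into a single stdlib-only helper:

    import time
    from typing import Callable


    def poll_until(condition: Callable[[], bool], timeout: float = 60, interval: float = 1) -> None:
        # Re-evaluate `condition` every `interval` seconds until it returns True,
        # raising TimeoutError once `timeout` seconds have elapsed.
        start = time.time()
        while time.time() - start < timeout:
            if condition():
                return
            time.sleep(interval)
        raise TimeoutError("timed out waiting for condition")

With this helper, wait_for_ready would reduce (modulo logging) to poll_until(lambda: client.describe_index(name=index_name)["status"]["ready"]).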
test/integration/connectors/test_qdrant.py
@@ -0,0 +1,137 @@
+ import json
+ import uuid
+ from contextlib import asynccontextmanager
+ from pathlib import Path
+ from typing import AsyncGenerator
+
+ import pytest
+ from qdrant_client import AsyncQdrantClient
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
+ from test.integration.connectors.utils.docker import container_context
+ from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+     CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE,
+ )
+ from unstructured_ingest.v2.processes.connectors.qdrant.local import (
+     LocalQdrantConnectionConfig,
+     LocalQdrantUploader,
+     LocalQdrantUploaderConfig,
+     LocalQdrantUploadStager,
+     LocalQdrantUploadStagerConfig,
+ )
+ from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+     CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE,
+ )
+ from unstructured_ingest.v2.processes.connectors.qdrant.server import (
+     ServerQdrantConnectionConfig,
+     ServerQdrantUploader,
+     ServerQdrantUploaderConfig,
+     ServerQdrantUploadStager,
+     ServerQdrantUploadStagerConfig,
+ )
+
+ COLLECTION_NAME = f"test-coll-{uuid.uuid4().hex[:12]}"
+ VECTORS_CONFIG = {"size": 384, "distance": "Cosine"}
+
+
+ @asynccontextmanager
+ async def qdrant_client(client_params: dict) -> AsyncGenerator[AsyncQdrantClient, None]:
+     client = AsyncQdrantClient(**client_params)
+     try:
+         yield client
+     finally:
+         await client.close()
+
+
+ async def validate_upload(client: AsyncQdrantClient, upload_file: Path):
+     with upload_file.open() as upload_fp:
+         elements = json.load(upload_fp)
+     expected_point_count = len(elements)
+     first_element = elements[0]
+     expected_text = first_element["text"]
+     embeddings = first_element["embeddings"]
+     collection = await client.get_collection(COLLECTION_NAME)
+     assert collection.points_count == expected_point_count
+
+     response = await client.query_points(COLLECTION_NAME, query=embeddings, limit=1)
+     assert response.points[0].payload is not None
+     assert response.points[0].payload["text"] == expected_text
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(LOCAL_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+ async def test_qdrant_destination_local(upload_file: Path, tmp_path: Path):
+     connection_kwargs = {"path": str(tmp_path / "qdrant")}
+     async with qdrant_client(connection_kwargs) as client:
+         await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+     AsyncQdrantClient(**connection_kwargs)
+     stager = LocalQdrantUploadStager(
+         upload_stager_config=LocalQdrantUploadStagerConfig(),
+     )
+     uploader = LocalQdrantUploader(
+         connection_config=LocalQdrantConnectionConfig(**connection_kwargs),
+         upload_config=LocalQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+     )
+
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=LOCAL_CONNECTOR_TYPE,
+         identifier="mock-file-data",
+     )
+
+     staged_upload_file = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+
+     if uploader.is_async():
+         await uploader.run_async(path=staged_upload_file, file_data=file_data)
+     else:
+         uploader.run(path=upload_file, file_data=file_data)
+     async with qdrant_client(connection_kwargs) as client:
+         await validate_upload(client=client, upload_file=upload_file)
+
+
+ @pytest.fixture
+ def docker_context():
+     with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
+         yield container
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(SERVER_CONNECTOR_TYPE, DESTINATION_TAG, "qdrant")
+ async def test_qdrant_destination_server(upload_file: Path, tmp_path: Path, docker_context):
+     connection_kwargs = {"location": "http://localhost:6333"}
+     async with qdrant_client(connection_kwargs) as client:
+         await client.create_collection(COLLECTION_NAME, vectors_config=VECTORS_CONFIG)
+     AsyncQdrantClient(**connection_kwargs)
+     stager = ServerQdrantUploadStager(
+         upload_stager_config=ServerQdrantUploadStagerConfig(),
+     )
+     uploader = ServerQdrantUploader(
+         connection_config=ServerQdrantConnectionConfig(**connection_kwargs),
+         upload_config=ServerQdrantUploaderConfig(collection_name=COLLECTION_NAME),
+     )
+
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=SERVER_CONNECTOR_TYPE,
+         identifier="mock-file-data",
+     )
+
+     staged_upload_file = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+
+     if uploader.is_async():
+         await uploader.run_async(path=staged_upload_file, file_data=file_data)
+     else:
+         uploader.run(path=upload_file, file_data=file_data)
+     async with qdrant_client(connection_kwargs) as client:
+         await validate_upload(client=client, upload_file=upload_file)
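The qdrant_client helper above wraps client construction in an async context manager so the connection is closed even when an assertion fails mid-test. The same pattern, sketched generically here assuming only the qdrant-client library, applies to any client exposing an async close():

    from contextlib import asynccontextmanager
    from typing import AsyncGenerator

    from qdrant_client import AsyncQdrantClient


    @asynccontextmanager
    async def managed_qdrant(**client_params) -> AsyncGenerator[AsyncQdrantClient, None]:
        # Construction is eager; the finally block guarantees cleanup.
        client = AsyncQdrantClient(**client_params)
        try:
            yield client
        finally:
            await client.close()


    # Hypothetical usage with an embedded on-disk instance, as in the local test:
    # async with managed_qdrant(path="/tmp/qdrant") as client:
    #     print(await client.get_collections())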
test/integration/connectors/test_s3.py
@@ -71,6 +71,29 @@ async def test_s3_source(anon_connection_config: S3ConnectionConfig):
      )
 
 
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ async def test_s3_source_special_char(anon_connection_config: S3ConnectionConfig):
+     indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/special-characters/")
+     with tempfile.TemporaryDirectory() as tempdir:
+         tempdir_path = Path(tempdir)
+         download_config = S3DownloaderConfig(download_dir=tempdir_path)
+         indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
+         downloader = S3Downloader(
+             connection_config=anon_connection_config, download_config=download_config
+         )
+         await source_connector_validation(
+             indexer=indexer,
+             downloader=downloader,
+             configs=ValidationConfigs(
+                 test_id="s3-specialchar",
+                 predownload_file_data_check=validate_predownload_file_data,
+                 postdownload_file_data_check=validate_postdownload_file_data,
+                 expected_num_files=1,
+             ),
+         )
+
+
  @pytest.mark.asyncio
  @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
  async def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
test/integration/connectors/utils/docker.py
@@ -47,14 +47,15 @@ def healthcheck_wait(container: Container, timeout: int = 10) -> None:
 
  @contextmanager
  def container_context(
-     docker_client: docker.DockerClient,
      image: str,
      ports: dict,
      environment: Optional[dict] = None,
      volumes: Optional[dict] = None,
      healthcheck: Optional[dict] = None,
      healthcheck_timeout: int = 10,
+     docker_client: Optional[docker.DockerClient] = None,
  ):
+     docker_client = docker_client or docker.from_env()
      container: Optional[Container] = None
      try:
          container = get_container(
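With docker_client now optional, callers no longer need to construct a DockerClient themselves; docker.from_env() is used when none is passed. A minimal usage sketch, assuming a reachable Docker daemon, matching how the new qdrant server test invokes it:

    from test.integration.connectors.utils.docker import container_context

    # No DockerClient argument needed anymore; the context manager creates one
    # via docker.from_env() and tears the container down on exit.
    with container_context(image="qdrant/qdrant:latest", ports={"6333": "6333"}) as container:
        print(container.status)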
test/integration/connectors/utils/validation.py
@@ -7,13 +7,14 @@ from pathlib import Path
  from typing import Callable, Optional
 
  import pandas as pd
+ from bs4 import BeautifulSoup
  from deepdiff import DeepDiff
 
  from test.integration.connectors.utils.constants import expected_results_path
  from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
- def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+ def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
      expected_df = pd.read_csv(expected_filepath)
      current_df = pd.read_csv(current_filepath)
      if expected_df.equals(current_df):
@@ -27,6 +28,42 @@ def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) ->
      return False
 
 
+ def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+     with expected_filepath.open() as expected_f:
+         expected_soup = BeautifulSoup(expected_f, "html.parser")
+     with current_filepath.open() as current_f:
+         current_soup = BeautifulSoup(current_f, "html.parser")
+     return expected_soup.text == current_soup.text
+
+
+ def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+     with expected_filepath.open() as expected_f:
+         expected_text_lines = expected_f.readlines()
+     with current_filepath.open() as current_f:
+         current_text_lines = current_f.readlines()
+     if len(expected_text_lines) != len(current_text_lines):
+         print(
+             f"Lines in expected text file ({len(expected_text_lines)}) "
+             f"don't match current text file ({len(current_text_lines)})"
+         )
+         return False
+     expected_text = "\n".join(expected_text_lines)
+     current_text = "\n".join(current_text_lines)
+     if expected_text == current_text:
+         return True
+     print("txt content don't match:")
+     print(f"expected: {expected_text}")
+     print(f"current: {current_text}")
+     return False
+
+
+ file_type_equality_check = {
+     ".json": json_equality_check,
+     ".html": html_equality_check,
+     ".txt": txt_equality_check,
+ }
+
+
  @dataclass
  class ValidationConfigs:
      test_id: str
@@ -39,6 +76,7 @@ class ValidationConfigs:
      )
      exclude_fields_extend: list[str] = field(default_factory=list)
      validate_downloaded_files: bool = False
+     validate_file_data: bool = True
      downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
      def get_exclude_fields(self) -> list[str]:
@@ -86,7 +124,7 @@
 
  def get_files(dir_path: Path) -> list[str]:
      return [
-         str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.iterdir() if f.is_file()
+         str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.rglob("*") if f.is_file()
      ]
 
 
@@ -122,6 +160,23 @@ def check_contents(
      assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
+ def detect_diff(
+     configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
+ ) -> bool:
+     if expected_filepath.suffix != current_filepath.suffix:
+         return True
+     if downloaded_file_equality_check := configs.downloaded_file_equality_check:
+         return not downloaded_file_equality_check(expected_filepath, current_filepath)
+     current_suffix = expected_filepath.suffix
+     if current_suffix in file_type_equality_check:
+         equality_check_callable = file_type_equality_check[current_suffix]
+         return not equality_check_callable(
+             expected_filepath=expected_filepath, current_filepath=current_filepath
+         )
+     # Fallback is using filecmp.cmp to compare the files
+     return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
  def check_raw_file_contents(
      expected_output_dir: Path,
      current_output_dir: Path,
@@ -133,15 +188,7 @@ def check_raw_file_contents(
      for current_file in current_files:
          current_file_path = current_output_dir / current_file
          expected_file_path = expected_output_dir / current_file
-         if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-             is_different = downloaded_file_equality_check(expected_file_path, current_file_path)
-         elif expected_file_path.suffix == ".csv" and current_file_path.suffix == ".csv":
-             is_different = not pandas_df_equality_check(
-                 expected_filepath=expected_file_path, current_filepath=current_file_path
-             )
-         else:
-             is_different = not filecmp.cmp(expected_file_path, current_file_path, shallow=False)
-         if is_different:
+         if detect_diff(configs, expected_file_path, current_file_path):
              found_diff = True
              files.append(str(expected_file_path))
              print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -185,17 +232,19 @@
      download_dir: Path,
      all_file_data: list[FileData],
      save_downloads: bool = False,
+     save_filedata: bool = True,
  ):
      # Delete current files
      shutil.rmtree(path=output_dir, ignore_errors=True)
      output_dir.mkdir(parents=True)
      # Rewrite the current file data
-     file_data_output_path = output_dir / "file_data"
-     file_data_output_path.mkdir(parents=True, exist_ok=True)
-     for file_data in all_file_data:
-         file_data_path = file_data_output_path / f"{file_data.identifier}.json"
-         with file_data_path.open(mode="w") as f:
-             json.dump(file_data.to_dict(), f, indent=2)
+     if save_filedata:
+         file_data_output_path = output_dir / "file_data"
+         file_data_output_path.mkdir(parents=True, exist_ok=True)
+         for file_data in all_file_data:
+             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
+             with file_data_path.open(mode="w") as f:
+                 json.dump(file_data.to_dict(), f, indent=2)
 
      # Record file structure of download directory
      download_files = get_files(dir_path=download_dir)
@@ -229,11 +278,12 @@
          predownload_file_data=pre_data, postdownload_file_data=post_data
      )
      configs.run_download_dir_validation(download_dir=download_dir)
-     run_expected_results_validation(
-         expected_output_dir=test_output_dir / "file_data",
-         all_file_data=postdownload_file_data,
-         configs=configs,
-     )
+     if configs.validate_file_data:
+         run_expected_results_validation(
+             expected_output_dir=test_output_dir / "file_data",
+             all_file_data=postdownload_file_data,
+             configs=configs,
+         )
      download_files = get_files(dir_path=download_dir)
      download_files.sort()
      run_directory_structure_validation(
@@ -291,4 +341,5 @@
          download_dir=download_dir,
          all_file_data=all_postdownload_file_data,
          save_downloads=configs.validate_downloaded_files,
+         save_filedata=configs.validate_file_data,
      )
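The new detect_diff helper resolves an equality check in priority order: a per-test downloaded_file_equality_check override, then the suffix-keyed file_type_equality_check map, then a byte-level filecmp.cmp fallback. Because that map is a plain module-level dict, a test could in principle register an extra suffix; the CSV checker below is purely hypothetical and not part of this release.

    from pathlib import Path

    from test.integration.connectors.utils.validation import file_type_equality_check


    def csv_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
        # Hypothetical checker: compare CSV text, ignoring a trailing newline.
        expected = expected_filepath.read_text().rstrip("\n")
        current = current_filepath.read_text().rstrip("\n")
        return expected == current


    # detect_diff would pick this up automatically for .csv files
    # whenever no per-test override is configured.
    file_type_equality_check[".csv"] = csv_equality_check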
test/unit/v2/chunkers/test_chunkers.py
@@ -0,0 +1,49 @@
+ import random
+
+ import faker
+ import pytest
+
+ from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
+
+ fake = faker.Faker()
+
+
+ def generate_chunker_config_params() -> dict:
+     params = {}
+     random_val = random.random()
+     if random_val < 0.5:
+         params["chunking_strategy"] = fake.word() if random.random() < 0.5 else None
+     params["chunk_combine_text_under_n_chars"] = (
+         fake.random_int() if random.random() < 0.5 else None
+     )
+     params["chunk_include_orig_elements"] = fake.boolean() if random.random() < 0.5 else None
+     params["chunk_max_characters"] = fake.random_int()
+     params["chunk_multipage_sections"] = fake.boolean()
+     params["chunk_new_after_n_chars"] = fake.random_int() if random.random() < 0.5 else None
+     params["chunk_overlap"] = fake.random_int() if random.random() < 0.5 else None
+     params["chunk_overlap_all"] = fake.boolean() if random.random() < 0.5 else None
+     if random_val < 0.5:
+         params["chunk_by_api"] = True
+         params["chunking_endpoint"] = fake.url()
+         params["chunk_api_key"] = fake.password()
+     else:
+         params["chunk_by_api"] = False
+
+     return params
+
+
+ @pytest.mark.parametrize(
+     "partition_config_params", [generate_chunker_config_params() for i in range(10)]
+ )
+ def test_chunker_config(partition_config_params: dict):
+     chunker_config = ChunkerConfig.model_validate(partition_config_params)
+     assert chunker_config
+
+
+ @pytest.mark.parametrize(
+     "partition_config_params", [generate_chunker_config_params() for i in range(10)]
+ )
+ def test_chunker(partition_config_params: dict):
+     chunker_config = ChunkerConfig.model_validate(partition_config_params)
+     chunker = Chunker(config=chunker_config)
+     assert chunker