unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_logger.py +0 -78
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,244 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import time
|
|
3
|
-
import uuid
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
|
|
7
|
-
import pytest
|
|
8
|
-
from neo4j import AsyncGraphDatabase, Driver, GraphDatabase
|
|
9
|
-
from neo4j.exceptions import ServiceUnavailable
|
|
10
|
-
from pytest_check import check
|
|
11
|
-
|
|
12
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, GRAPH_DB_TAG
|
|
13
|
-
from test.integration.connectors.utils.docker import container_context
|
|
14
|
-
from unstructured_ingest.data_types.file_data import (
|
|
15
|
-
FileData,
|
|
16
|
-
FileDataSourceMetadata,
|
|
17
|
-
SourceIdentifiers,
|
|
18
|
-
)
|
|
19
|
-
from unstructured_ingest.error import DestinationConnectionError
|
|
20
|
-
from unstructured_ingest.processes.connectors.neo4j import (
|
|
21
|
-
CONNECTOR_TYPE,
|
|
22
|
-
Label,
|
|
23
|
-
Neo4jAccessConfig,
|
|
24
|
-
Neo4jConnectionConfig,
|
|
25
|
-
Neo4jUploader,
|
|
26
|
-
Neo4jUploaderConfig,
|
|
27
|
-
Neo4jUploadStager,
|
|
28
|
-
Relationship,
|
|
29
|
-
)
|
|
30
|
-
from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
|
|
31
|
-
|
|
32
|
-
# Connection settings for the Neo4j instance these tests talk to.
USERNAME: str = "neo4j"
PASSWORD: str = "password"
URI: str = "neo4j://localhost:7687"
DATABASE: str = "neo4j"

# Number of Document nodes the graph validation expects after an upload.
EXPECTED_DOCUMENT_COUNT: int = 1
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
# NOTE: Precheck tests are read-only so we utilize the same container for all tests.
|
|
41
|
-
# If new tests require clean neo4j container, this fixture's scope should be adjusted.
|
|
42
|
-
@pytest.fixture(autouse=True, scope="module")
def _neo4j_server():
    """Start a Neo4j container for this module and wait until it accepts connections.

    The fixture is autouse and module-scoped, so one container is shared by every
    test in the module. Control is handed to the tests only after
    ``wait_for_connection`` succeeds; the ``container_context`` block is exited
    once the module's tests are done.
    """
    with container_context(
        image="neo4j:latest",
        # Built from the module constants so the server credentials cannot drift
        # from the ones the tests log in with.
        environment={"NEO4J_AUTH": f"{USERNAME}/{PASSWORD}"},
        ports={"7687": "7687"},
    ):
        driver = GraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))
        try:
            wait_for_connection(driver)
        finally:
            # wait_for_connection() ends in pytest.fail(), which raises; close the
            # probe driver on that path too instead of leaking it.
            driver.close()
        yield
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
@pytest.mark.asyncio
@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
async def test_neo4j_destination(upload_file: Path, tmp_path: Path):
    """Upload a fixture to Neo4j twice and validate the resulting graph each time.

    The second pass re-uploads the same elements with freshly generated
    ``element_id`` values under the same ``FileData``. The validation counts
    every node in the database against the file just uploaded, so it only
    passes if nodes from the first pass are no longer present.
    """
    connection_config = Neo4jConnectionConfig(
        access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
        username=USERNAME,
        uri=URI,
        database=DATABASE,
    )
    stager = Neo4jUploadStager()
    uploader = Neo4jUploader(
        connection_config=connection_config,
        upload_config=Neo4jUploaderConfig(),
    )

    source_identifiers = SourceIdentifiers(
        filename=upload_file.name,
        fullpath=upload_file.name,
    )
    source_metadata = FileDataSourceMetadata(
        date_created=str(datetime(2022, 1, 1).timestamp()),
        date_modified=str(datetime(2022, 1, 2).timestamp()),
    )
    file_data = FileData(
        identifier="mock-file-data",
        connector_type="neo4j",
        source_identifiers=source_identifiers,
        metadata=source_metadata,
    )

    async def stage_upload_and_validate(elements_file: Path) -> None:
        # One full stage -> upload -> validate round trip for a single elements file.
        staged_filepath = stager.run(
            elements_file,
            file_data=file_data,
            output_dir=tmp_path,
            output_filename=elements_file.name,
        )
        await uploader.run_async(staged_filepath, file_data)
        await validate_uploaded_graph(elements_file)

    await stage_upload_and_validate(upload_file)

    # Same content, brand-new element ids.
    with open(upload_file) as original:
        elements = json.load(original)
    for element in elements:
        element["element_id"] = str(uuid.uuid4())

    modified_upload_file = tmp_path / f"modified-{upload_file.name}"
    with open(modified_upload_file, "w") as rewritten:
        json.dump(elements, rewritten, indent=4)

    await stage_upload_and_validate(modified_upload_file)
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
@pytest.mark.tags(DESTINATION_TAG, CONNECTOR_TYPE, GRAPH_DB_TAG)
class TestPrecheck:
    """Exercise ``Neo4jUploader.precheck`` with valid and deliberately broken settings."""

    @pytest.fixture
    def configured_uploader(self) -> Neo4jUploader:
        """Return an uploader built from the module's valid connection constants.

        Each failure test mutates exactly one field of this uploader before
        calling ``precheck``.
        """
        valid_connection = Neo4jConnectionConfig(
            access_config=Neo4jAccessConfig(password=PASSWORD),  # type: ignore
            username=USERNAME,
            uri=URI,
            database=DATABASE,
        )
        return Neo4jUploader(
            connection_config=valid_connection,
            upload_config=Neo4jUploaderConfig(),
        )

    def test_succeeds(self, configured_uploader: Neo4jUploader):
        """With untouched settings, precheck must not raise."""
        configured_uploader.precheck()

    def test_fails_on_invalid_password(self, configured_uploader: Neo4jUploader):
        """A wrong password surfaces as an Unauthorized connection error."""
        secrets = configured_uploader.connection_config.access_config.get_secret_value()
        secrets.password = "invalid-password"
        with pytest.raises(
            DestinationConnectionError,
            match="{code: Neo.ClientError.Security.Unauthorized}",
        ):
            configured_uploader.precheck()

    def test_fails_on_invalid_username(self, configured_uploader: Neo4jUploader):
        """A wrong username surfaces as an Unauthorized connection error."""
        connection = configured_uploader.connection_config
        connection.username = "invalid-username"
        with pytest.raises(
            DestinationConnectionError,
            match="{code: Neo.ClientError.Security.Unauthorized}",
        ):
            configured_uploader.precheck()

    @pytest.mark.parametrize(
        ("uri", "expected_error_msg"),
        [
            ("neo4j://localhst:7687", "Cannot resolve address"),
            ("neo4j://localhost:7777", "Unable to retrieve routing information"),
        ],
    )
    def test_fails_on_invalid_uri(
        self, configured_uploader: Neo4jUploader, uri: str, expected_error_msg: str
    ):
        """A misspelled host and a wrong port each produce their own error message."""
        connection = configured_uploader.connection_config
        connection.uri = uri
        with pytest.raises(DestinationConnectionError, match=expected_error_msg):
            configured_uploader.precheck()

    def test_fails_on_invalid_database(self, configured_uploader: Neo4jUploader):
        """An unknown database name surfaces as DatabaseNotFound."""
        connection = configured_uploader.connection_config
        connection.database = "invalid-database"
        with pytest.raises(
            DestinationConnectionError,
            match="{code: Neo.ClientError.Database.DatabaseNotFound}",
        ):
            configured_uploader.precheck()
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
def wait_for_connection(driver: Driver, retries: int = 10, delay_seconds: int = 2):
    """Block until ``driver`` can reach the server, or fail the test run.

    Args:
        driver: Driver whose ``verify_connectivity()`` is polled.
        retries: Maximum number of connectivity attempts.
        delay_seconds: Pause after each attempt that raises ``ServiceUnavailable``.

    Returns:
        ``None`` as soon as one attempt succeeds. If every attempt raises
        ``ServiceUnavailable``, ``pytest.fail`` is called. Any other exception
        propagates immediately.
    """
    for _ in range(retries):
        try:
            driver.verify_connectivity()
        except ServiceUnavailable:
            # Server not reachable yet; wait and try again.
            time.sleep(delay_seconds)
        else:
            return

    pytest.fail("Failed to connect with Neo4j server.")
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
async def validate_uploaded_graph(upload_file: Path):
    """Compare the graph stored in Neo4j with what ``upload_file`` should produce.

    Expected counts are derived from the JSON file: one Chunk node per entry,
    one UnstructuredElement node per distinct ``element_id`` found in the
    entries' decoded ``orig_elements``, and ``EXPECTED_DOCUMENT_COUNT`` Document
    nodes. Node counts are checked first; the relationship counts are only
    checked when no earlier soft check has failed.

    Args:
        upload_file: JSON file of elements that was staged and uploaded.
    """
    with open(upload_file) as source:
        chunks = json.load(source)

    # Decode the packed original elements where present; otherwise treat as none.
    for chunk in chunks:
        chunk_metadata = chunk["metadata"]
        if "orig_elements" not in chunk_metadata:
            chunk_metadata["orig_elements"] = []
            continue
        chunk_metadata["orig_elements"] = elements_from_base64_gzipped_json(
            chunk_metadata["orig_elements"]
        )

    origin_ids = set()
    for chunk in chunks:
        for origin_element in chunk["metadata"]["orig_elements"]:
            origin_ids.add(origin_element["element_id"])

    expected_chunks_count = len(chunks)
    expected_element_count = len(origin_ids)
    expected_nodes_count = expected_chunks_count + expected_element_count + EXPECTED_DOCUMENT_COUNT

    driver = AsyncGraphDatabase.driver(uri=URI, auth=(USERNAME, PASSWORD))

    async def count_rows(query: str) -> int:
        # The first item of the query result is the sequence of returned rows.
        return len((await driver.execute_query(query))[0])

    try:
        nodes_count = await count_rows("MATCH (n) RETURN n")
        chunk_nodes_count = await count_rows(f"MATCH (n: {Label.CHUNK.value}) RETURN n")
        document_nodes_count = await count_rows(f"MATCH (n: {Label.DOCUMENT.value}) RETURN n")
        element_nodes_count = await count_rows(
            f"MATCH (n: {Label.UNSTRUCTURED_ELEMENT.value}) RETURN n"
        )

        # Soft checks: each failure is recorded without stopping the others.
        with check:
            assert nodes_count == expected_nodes_count
        with check:
            assert document_nodes_count == EXPECTED_DOCUMENT_COUNT
        with check:
            assert chunk_nodes_count == expected_chunks_count
        with check:
            assert element_nodes_count == expected_element_count

        part_of_document_records, _, _ = await driver.execute_query(
            f"""
            MATCH ()-[r:{Relationship.PART_OF_DOCUMENT.value}]->(:{Label.DOCUMENT.value})
            RETURN r
            """
        )
        part_of_document_count = len(part_of_document_records)

        next_chunk_records, _, _ = await driver.execute_query(
            f"""
            MATCH (:{Label.CHUNK.value})-[r:{Relationship.NEXT_CHUNK.value}]->(:{Label.CHUNK.value})
            RETURN r
            """
        )
        next_chunk_count = len(next_chunk_records)

        # Relationship counts are only meaningful when the node counts matched.
        if not check.any_failures():
            with check:
                assert part_of_document_count == expected_chunks_count + expected_element_count
            with check:
                assert next_chunk_count == expected_chunks_count - 1

    finally:
        await driver.close()
|
|
@@ -1,152 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
|
|
6
|
-
from test.integration.connectors.utils.validation.source import (
|
|
7
|
-
SourceValidationConfigs,
|
|
8
|
-
get_all_file_data,
|
|
9
|
-
run_all_validations,
|
|
10
|
-
update_fixtures,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.interfaces import Downloader, Indexer
|
|
13
|
-
from unstructured_ingest.processes.connectors.notion.connector import (
|
|
14
|
-
CONNECTOR_TYPE,
|
|
15
|
-
NotionAccessConfig,
|
|
16
|
-
NotionConnectionConfig,
|
|
17
|
-
NotionDownloader,
|
|
18
|
-
NotionDownloaderConfig,
|
|
19
|
-
NotionIndexer,
|
|
20
|
-
NotionIndexerConfig,
|
|
21
|
-
)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
def test_notion_source_database(temp_dir):
    """Index and download one Notion database, then run the source validations.

    Reads the API key from the ``NOTION_API_KEY`` environment variable; a
    missing variable raises ``KeyError``.
    """
    connection_config = NotionConnectionConfig(
        access_config=NotionAccessConfig(notion_api_key=os.environ["NOTION_API_KEY"]),
    )

    indexer = NotionIndexer(
        connection_config=connection_config,
        index_config=NotionIndexerConfig(
            database_ids=["1722c3765a0a8082b382ebc2c62d3f4c"], recursive=False
        ),
    )
    downloader = NotionDownloader(
        connection_config=connection_config,
        download_config=NotionDownloaderConfig(download_dir=temp_dir),
    )

    # Creation/modification dates are excluded from the fixture comparison.
    validation_configs = SourceValidationConfigs(
        test_id="notion_database",
        expected_num_files=1,
        validate_downloaded_files=True,
        exclude_fields_extend=["metadata.date_created", "metadata.date_modified"],
    )
    source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=validation_configs,
    )
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
def test_notion_source_page(temp_dir):
    """Index and download one Notion page, then run the source validations.

    Reads the API key from the ``NOTION_API_KEY`` environment variable; a
    missing variable raises ``KeyError``.
    """
    connection_config = NotionConnectionConfig(
        access_config=NotionAccessConfig(notion_api_key=os.environ["NOTION_API_KEY"]),
    )

    indexer = NotionIndexer(
        connection_config=connection_config,
        index_config=NotionIndexerConfig(
            page_ids=["1572c3765a0a806299f0dd6999f9e4c7"], recursive=False
        ),
    )
    downloader = NotionDownloader(
        connection_config=connection_config,
        download_config=NotionDownloaderConfig(download_dir=temp_dir),
    )

    # Creation/modification dates are excluded from the fixture comparison.
    validation_configs = SourceValidationConfigs(
        test_id="notion_page",
        expected_num_files=1,
        validate_downloaded_files=True,
        exclude_fields_extend=["metadata.date_created", "metadata.date_modified"],
    )
    source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=validation_configs,
    )
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def source_connector_validation(
    indexer: Indexer,
    downloader: Downloader,
    configs: SourceValidationConfigs,
    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
) -> None:
    """Run a source connector end to end and validate (or regenerate) its fixtures.

    This is a helper called by the tests in this module, not a test itself, so
    it carries no pytest marker.

    Args:
        indexer: Indexer to precheck and run.
        downloader: Downloader invoked once for every indexed file data.
        configs: Validation settings; also supplies the test output directory.
        overwrite_fixtures: If set to True, skip all validators and instead
            overwrite the saved expected values with what this run generated.
            The default is read from the ``OVERWRITE_FIXTURES`` environment
            variable once, when the function is defined.
    """
    all_predownload_file_data = []
    all_postdownload_file_data = []
    indexer.precheck()
    download_dir = downloader.download_config.download_dir
    test_output_dir = configs.test_output_dir()

    for file_data in indexer.run():
        assert file_data
        # Snapshot before the downloader runs so the pre-download state is
        # captured independently of whatever the downloader does with file_data.
        all_predownload_file_data.append(file_data.model_copy(deep=True))
        resp = downloader.run(file_data=file_data)
        # A downloader may return a single response or a list of them.
        responses = resp if isinstance(resp, list) else [resp]
        all_postdownload_file_data.extend(
            response["file_data"].model_copy(deep=True) for response in responses
        )

    if not overwrite_fixtures:
        print("Running validation")
        run_all_validations(
            configs=configs,
            predownload_file_data=all_predownload_file_data,
            postdownload_file_data=all_postdownload_file_data,
            download_dir=download_dir,
            test_output_dir=test_output_dir,
        )
    else:
        print("Running fixtures update")
        update_fixtures(
            output_dir=test_output_dir,
            download_dir=download_dir,
            all_file_data=get_all_file_data(
                all_predownload_file_data=all_predownload_file_data,
                all_postdownload_file_data=all_postdownload_file_data,
            ),
            save_downloads=configs.validate_downloaded_files,
            save_filedata=configs.validate_file_data,
        )
|
|
@@ -1,163 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import uuid
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
from office365.graph_client import GraphClient
|
|
7
|
-
|
|
8
|
-
from test.integration.connectors.utils.constants import (
|
|
9
|
-
BLOB_STORAGE_TAG,
|
|
10
|
-
DESTINATION_TAG,
|
|
11
|
-
SOURCE_TAG,
|
|
12
|
-
)
|
|
13
|
-
from test.integration.connectors.utils.validation.source import (
|
|
14
|
-
SourceValidationConfigs,
|
|
15
|
-
source_connector_validation,
|
|
16
|
-
)
|
|
17
|
-
from test.integration.utils import requires_env
|
|
18
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
19
|
-
from unstructured_ingest.processes.connectors.onedrive import (
|
|
20
|
-
CONNECTOR_TYPE,
|
|
21
|
-
OnedriveAccessConfig,
|
|
22
|
-
OnedriveConnectionConfig,
|
|
23
|
-
OnedriveDownloader,
|
|
24
|
-
OnedriveDownloaderConfig,
|
|
25
|
-
OnedriveIndexer,
|
|
26
|
-
OnedriveIndexerConfig,
|
|
27
|
-
OnedriveUploader,
|
|
28
|
-
OnedriveUploaderConfig,
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@pytest.fixture
def onedrive_test_folder() -> str:
    """
    Create a uniquely named folder in the user's OneDrive root, yield its name,
    and delete the folder again once the test has finished.
    """
    connection_config = get_connection_config()
    client: GraphClient = connection_config.get_client()
    user_drive = client.users[connection_config.user_pname].drive

    # A random suffix gives every run its own folder name.
    folder_name = f"utic-test-output-{uuid.uuid4()}"
    created_folder = user_drive.root.create_folder(folder_name).execute_query()
    print(f"created folder: {created_folder.name}")

    try:
        yield folder_name
    finally:
        # Teardown runs even when the test body raised.
        created_folder.delete_object().execute_query()
        print(f"successfully deleted folder: {created_folder.name}")
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def get_connection_config():
    """
    Build an OnedriveConnectionConfig from the MS_CLIENT_ID, MS_CLIENT_CRED,
    MS_TENANT_ID and MS_USER_PNAME environment variables.

    This is a plain helper function, not a pytest fixture. Variables that are
    not set are passed through as ``None``.
    """
    access_config = OnedriveAccessConfig(client_cred=os.getenv("MS_CLIENT_CRED"))
    return OnedriveConnectionConfig(
        client_id=os.getenv("MS_CLIENT_ID"),
        tenant=os.getenv("MS_TENANT_ID"),
        user_pname=os.getenv("MS_USER_PNAME"),
        access_config=access_config,
    )
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
async def test_onedrive_source(temp_dir):
    """Index the ``eml`` path recursively, download into ``temp_dir`` and validate."""
    connection_config = get_connection_config()

    indexer = OnedriveIndexer(
        connection_config=connection_config,
        index_config=OnedriveIndexerConfig(recursive=True, path="eml"),
    )
    downloader = OnedriveDownloader(
        connection_config=connection_config,
        download_config=OnedriveDownloaderConfig(download_dir=temp_dir),
    )

    # These fields are left out of the comparison against the saved fixtures.
    ignored_fields = [
        "metadata.date_created",
        "metadata.date_modified",
        "additional_metadata.LastModified",
        "additional_metadata.@microsoft.graph.downloadUrl",
    ]
    validation_configs = SourceValidationConfigs(
        test_id="onedrive",
        expected_num_files=1,
        validate_downloaded_files=True,
        exclude_fields_extend=ignored_fields,
    )

    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=validation_configs,
    )
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
@requires_env("MS_CLIENT_CRED", "MS_CLIENT_ID", "MS_TENANT_ID", "MS_USER_PNAME")
def xtest_onedrive_destination(upload_file: Path, onedrive_test_folder: str):
    """
    Integration test for the OneDrive destination connector.

    Uploads ``upload_file`` into the folder provided by the fixture and checks
    that a file named ``<upload_file.name>.json`` exists there afterwards.

    NOTE: the name starts with ``x`` rather than ``test``, so pytest's default
    ``test*`` function discovery does not pick it up.
    """
    connection_config = get_connection_config()
    destination_fullpath = f"{onedrive_test_folder}/{upload_file.name}"

    uploader = OnedriveUploader(
        connection_config=connection_config,
        upload_config=OnedriveUploaderConfig(remote_url=f"onedrive://{onedrive_test_folder}"),
    )
    file_data = FileData(
        source_identifiers=SourceIdentifiers(
            fullpath=destination_fullpath,
            filename=upload_file.name,
        ),
        connector_type=CONNECTOR_TYPE,
        identifier="mock_file_data",
    )

    uploader.precheck()
    uploader.run(path=upload_file, file_data=file_data)

    # Look the uploaded file up through a fresh client.
    user_drive = connection_config.get_client().users[connection_config.user_pname].drive

    # Workaround: the uploaded name carries a ".json" suffix that the
    # metadata filename does not have.
    remote_item = user_drive.root.get_by_path(f"{destination_fullpath}.json")
    uploaded_file = remote_item.select(["id", "name"]).get().execute_query()

    assert uploaded_file is not None
    assert uploaded_file.name == f"{upload_file.name}.json"
|