unstructured-ingest 0.5.23__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest has been flagged as potentially problematic; see the details linked from the original page.
- test/integration/connectors/databricks/test_volumes_native.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +1 -1
- test/integration/connectors/duckdb/test_motherduck.py +1 -1
- test/integration/connectors/elasticsearch/test_elasticsearch.py +1 -1
- test/integration/connectors/elasticsearch/test_opensearch.py +1 -1
- test/integration/connectors/sql/test_databricks_delta_tables.py +1 -1
- test/integration/connectors/sql/test_postgres.py +1 -1
- test/integration/connectors/sql/test_singlestore.py +1 -1
- test/integration/connectors/sql/test_snowflake.py +1 -1
- test/integration/connectors/sql/test_sqlite.py +1 -1
- test/integration/connectors/test_astradb.py +1 -1
- test/integration/connectors/test_azure_ai_search.py +1 -1
- test/integration/connectors/test_chroma.py +1 -1
- test/integration/connectors/test_delta_table.py +1 -1
- test/integration/connectors/test_lancedb.py +1 -1
- test/integration/connectors/test_milvus.py +1 -1
- test/integration/connectors/test_mongodb.py +1 -1
- test/integration/connectors/test_neo4j.py +5 -5
- test/integration/connectors/test_onedrive.py +1 -1
- test/integration/connectors/test_pinecone.py +1 -1
- test/integration/connectors/test_qdrant.py +1 -1
- test/integration/connectors/test_redis.py +1 -1
- test/integration/connectors/test_s3.py +1 -1
- test/integration/connectors/test_vectara.py +68 -56
- test/integration/connectors/utils/validation/destination.py +2 -1
- test/integration/connectors/utils/validation/source.py +2 -1
- test/integration/connectors/weaviate/test_local.py +1 -1
- test/unit/test_html.py +1 -1
- test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +1 -1
- test/unit/v2/connectors/motherduck/test_base.py +1 -2
- test/unit/v2/connectors/sql/test_sql.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/html.py +2 -1
- unstructured_ingest/v2/interfaces/__init__.py +0 -13
- unstructured_ingest/v2/interfaces/downloader.py +1 -1
- unstructured_ingest/v2/interfaces/indexer.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +2 -2
- unstructured_ingest/v2/interfaces/uploader.py +2 -3
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -2
- unstructured_ingest/v2/pipeline/steps/download.py +2 -3
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -2
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -2
- unstructured_ingest/v2/processes/connectors/airtable.py +1 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +7 -5
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/v2/processes/connectors/chroma.py +1 -1
- unstructured_ingest/v2/processes/connectors/confluence.py +5 -3
- unstructured_ingest/v2/processes/connectors/couchbase.py +7 -5
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +64 -19
- unstructured_ingest/v2/processes/connectors/delta_table.py +1 -1
- unstructured_ingest/v2/processes/connectors/discord.py +5 -3
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -1
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +1 -1
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +1 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -5
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +5 -3
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -4
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +1 -1
- unstructured_ingest/v2/processes/connectors/gitlab.py +5 -3
- unstructured_ingest/v2/processes/connectors/google_drive.py +5 -3
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +1 -1
- unstructured_ingest/v2/processes/connectors/jira.py +5 -3
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +5 -3
- unstructured_ingest/v2/processes/connectors/kdbai.py +1 -1
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -4
- unstructured_ingest/v2/processes/connectors/local.py +5 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/mongodb.py +7 -5
- unstructured_ingest/v2/processes/connectors/neo4j.py +1 -1
- unstructured_ingest/v2/processes/connectors/notion/connector.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +5 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +5 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +1 -1
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +1 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +1 -1
- unstructured_ingest/v2/processes/connectors/salesforce.py +5 -3
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -3
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +13 -8
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +3 -3
- unstructured_ingest/v2/processes/connectors/vectara.py +1 -1
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +1 -1
- unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +5 -3
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/uncompress.py +1 -1
- unstructured_ingest/v2/processes/utils/blob_storage.py +2 -1
- unstructured_ingest/v2/utils.py +1 -1
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/METADATA +101 -101
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/RECORD +104 -105
- unstructured_ingest/v2/interfaces/file_data.py +0 -13
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -21,7 +21,6 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
21
21
|
)
|
|
22
22
|
from test.integration.utils import requires_env
|
|
23
23
|
from unstructured_ingest.v2.errors import UserAuthError, UserError
|
|
24
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
25
24
|
from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
|
|
26
25
|
CONNECTOR_TYPE,
|
|
27
26
|
DatabricksNativeVolumesAccessConfig,
|
|
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native impor
|
|
|
33
32
|
DatabricksNativeVolumesUploader,
|
|
34
33
|
DatabricksNativeVolumesUploaderConfig,
|
|
35
34
|
)
|
|
35
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
@dataclass
|
|
@@ -10,7 +10,6 @@ from test.integration.connectors.utils.validation.destination import (
|
|
|
10
10
|
StagerValidationConfigs,
|
|
11
11
|
stager_validation,
|
|
12
12
|
)
|
|
13
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
14
13
|
from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
|
|
15
14
|
CONNECTOR_TYPE,
|
|
16
15
|
DuckDBConnectionConfig,
|
|
@@ -18,6 +17,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
|
|
|
18
17
|
DuckDBUploaderConfig,
|
|
19
18
|
DuckDBUploadStager,
|
|
20
19
|
)
|
|
20
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
@pytest.fixture
|
|
@@ -9,7 +9,6 @@ import pytest
|
|
|
9
9
|
|
|
10
10
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
|
|
11
11
|
from test.integration.utils import requires_env
|
|
12
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
13
12
|
from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
|
|
14
13
|
CONNECTOR_TYPE,
|
|
15
14
|
MotherDuckAccessConfig,
|
|
@@ -18,6 +17,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
|
|
|
18
17
|
MotherDuckUploaderConfig,
|
|
19
18
|
MotherDuckUploadStager,
|
|
20
19
|
)
|
|
20
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
@pytest.fixture
|
|
@@ -22,7 +22,7 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
22
22
|
source_connector_validation,
|
|
23
23
|
)
|
|
24
24
|
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
25
|
-
from unstructured_ingest.v2.
|
|
25
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
26
26
|
from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
|
|
27
27
|
CONNECTOR_TYPE,
|
|
28
28
|
ElasticsearchAccessConfig,
|
|
@@ -24,7 +24,6 @@ from unstructured_ingest.error import (
|
|
|
24
24
|
DestinationConnectionError,
|
|
25
25
|
SourceConnectionError,
|
|
26
26
|
)
|
|
27
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
28
27
|
from unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch import (
|
|
29
28
|
CONNECTOR_TYPE,
|
|
30
29
|
OpenSearchAccessConfig,
|
|
@@ -38,6 +37,7 @@ from unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch import
|
|
|
38
37
|
OpenSearchUploadStager,
|
|
39
38
|
OpenSearchUploadStagerConfig,
|
|
40
39
|
)
|
|
40
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
41
41
|
|
|
42
42
|
SOURCE_INDEX_NAME = "movies"
|
|
43
43
|
DESTINATION_INDEX_NAME = "elements"
|
|
@@ -14,7 +14,6 @@ from pytest_mock import MockerFixture
|
|
|
14
14
|
|
|
15
15
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG, env_setup_path
|
|
16
16
|
from test.integration.utils import requires_env
|
|
17
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
18
17
|
from unstructured_ingest.v2.logger import logger
|
|
19
18
|
from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
|
|
20
19
|
CONNECTOR_TYPE,
|
|
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables imp
|
|
|
24
23
|
DatabricksDeltaTablesUploaderConfig,
|
|
25
24
|
DatabricksDeltaTablesUploadStager,
|
|
26
25
|
)
|
|
26
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
27
27
|
|
|
28
28
|
CATALOG = "utic-dev-tech-fixtures"
|
|
29
29
|
|
|
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
20
20
|
SourceValidationConfigs,
|
|
21
21
|
source_connector_validation,
|
|
22
22
|
)
|
|
23
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
24
23
|
from unstructured_ingest.v2.processes.connectors.sql.postgres import (
|
|
25
24
|
CONNECTOR_TYPE,
|
|
26
25
|
PostgresAccessConfig,
|
|
@@ -32,6 +31,7 @@ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
|
|
|
32
31
|
PostgresUploader,
|
|
33
32
|
PostgresUploadStager,
|
|
34
33
|
)
|
|
34
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
35
35
|
|
|
36
36
|
SEED_DATA_ROWS = 10
|
|
37
37
|
|
|
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
20
20
|
SourceValidationConfigs,
|
|
21
21
|
source_connector_validation,
|
|
22
22
|
)
|
|
23
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
24
23
|
from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
|
|
25
24
|
CONNECTOR_TYPE,
|
|
26
25
|
SingleStoreAccessConfig,
|
|
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
|
|
|
33
32
|
SingleStoreUploaderConfig,
|
|
34
33
|
SingleStoreUploadStager,
|
|
35
34
|
)
|
|
35
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
36
36
|
|
|
37
37
|
SEED_DATA_ROWS = 10
|
|
38
38
|
|
|
@@ -22,7 +22,6 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
22
22
|
source_connector_validation,
|
|
23
23
|
)
|
|
24
24
|
from test.integration.utils import requires_env
|
|
25
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
26
25
|
from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
|
|
27
26
|
CONNECTOR_TYPE,
|
|
28
27
|
SnowflakeAccessConfig,
|
|
@@ -34,6 +33,7 @@ from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
|
|
|
34
33
|
SnowflakeUploader,
|
|
35
34
|
SnowflakeUploadStager,
|
|
36
35
|
)
|
|
36
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
37
37
|
|
|
38
38
|
SEED_DATA_ROWS = 20
|
|
39
39
|
|
|
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
20
20
|
SourceValidationConfigs,
|
|
21
21
|
source_connector_validation,
|
|
22
22
|
)
|
|
23
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
24
23
|
from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
|
|
25
24
|
CONNECTOR_TYPE,
|
|
26
25
|
SQLiteConnectionConfig,
|
|
@@ -31,6 +30,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
|
|
|
31
30
|
SQLiteUploader,
|
|
32
31
|
SQLiteUploadStager,
|
|
33
32
|
)
|
|
33
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
34
34
|
|
|
35
35
|
SEED_DATA_ROWS = 10
|
|
36
36
|
|
|
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
20
20
|
source_connector_validation,
|
|
21
21
|
)
|
|
22
22
|
from test.integration.utils import requires_env
|
|
23
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
24
23
|
from unstructured_ingest.v2.processes.connectors.astradb import (
|
|
25
24
|
CONNECTOR_TYPE,
|
|
26
25
|
AstraDBAccessConfig,
|
|
@@ -36,6 +35,7 @@ from unstructured_ingest.v2.processes.connectors.astradb import (
|
|
|
36
35
|
DestinationConnectionError,
|
|
37
36
|
SourceConnectionError,
|
|
38
37
|
)
|
|
38
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
39
39
|
|
|
40
40
|
EXISTENT_COLLECTION_NAME = "ingest_test_src"
|
|
41
41
|
NONEXISTENT_COLLECTION_NAME = "nonexistant"
|
|
@@ -29,7 +29,6 @@ from test.integration.connectors.utils.validation.destination import (
|
|
|
29
29
|
stager_validation,
|
|
30
30
|
)
|
|
31
31
|
from test.integration.utils import requires_env
|
|
32
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
33
32
|
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
|
|
34
33
|
CONNECTOR_TYPE,
|
|
35
34
|
RECORD_ID_LABEL,
|
|
@@ -40,6 +39,7 @@ from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
|
|
|
40
39
|
AzureAISearchUploadStager,
|
|
41
40
|
AzureAISearchUploadStagerConfig,
|
|
42
41
|
)
|
|
42
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
43
43
|
|
|
44
44
|
repo_path = Path(__file__).parent.resolve()
|
|
45
45
|
|
|
@@ -27,7 +27,7 @@ from test.integration.connectors.utils.validation.destination import (
|
|
|
27
27
|
StagerValidationConfigs,
|
|
28
28
|
stager_validation,
|
|
29
29
|
)
|
|
30
|
-
from unstructured_ingest.v2.
|
|
30
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
31
31
|
from unstructured_ingest.v2.processes.connectors.chroma import (
|
|
32
32
|
CONNECTOR_TYPE,
|
|
33
33
|
ChromaConnectionConfig,
|
|
@@ -8,7 +8,6 @@ from fsspec import get_filesystem_class
|
|
|
8
8
|
|
|
9
9
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
|
|
10
10
|
from test.integration.utils import requires_env
|
|
11
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
12
11
|
from unstructured_ingest.v2.processes.connectors.delta_table import (
|
|
13
12
|
CONNECTOR_TYPE,
|
|
14
13
|
DeltaTableAccessConfig,
|
|
@@ -18,6 +17,7 @@ from unstructured_ingest.v2.processes.connectors.delta_table import (
|
|
|
18
17
|
DeltaTableUploadStager,
|
|
19
18
|
DeltaTableUploadStagerConfig,
|
|
20
19
|
)
|
|
20
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
21
21
|
|
|
22
22
|
multiprocessing.set_start_method("spawn")
|
|
23
23
|
|
|
@@ -13,7 +13,6 @@ from upath import UPath
|
|
|
13
13
|
|
|
14
14
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
|
|
15
15
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
16
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
17
16
|
from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
|
|
18
17
|
LanceDBAwsAccessConfig,
|
|
19
18
|
LanceDBAwsConnectionConfig,
|
|
@@ -39,6 +38,7 @@ from unstructured_ingest.v2.processes.connectors.lancedb.local import (
|
|
|
39
38
|
LanceDBLocalConnectionConfig,
|
|
40
39
|
LanceDBLocalUploader,
|
|
41
40
|
)
|
|
41
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
42
42
|
|
|
43
43
|
DATABASE_NAME = "database"
|
|
44
44
|
TABLE_NAME = "elements"
|
|
@@ -25,7 +25,6 @@ from test.integration.connectors.utils.validation.destination import (
|
|
|
25
25
|
stager_validation,
|
|
26
26
|
)
|
|
27
27
|
from unstructured_ingest.error import DestinationConnectionError
|
|
28
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
29
28
|
from unstructured_ingest.v2.processes.connectors.milvus import (
|
|
30
29
|
CONNECTOR_TYPE,
|
|
31
30
|
MilvusConnectionConfig,
|
|
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connectors.milvus import (
|
|
|
33
32
|
MilvusUploaderConfig,
|
|
34
33
|
MilvusUploadStager,
|
|
35
34
|
)
|
|
35
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
36
36
|
|
|
37
37
|
DB_NAME = "test_database"
|
|
38
38
|
EXISTENT_COLLECTION_NAME = "test_collection"
|
|
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
20
20
|
)
|
|
21
21
|
from test.integration.utils import requires_env
|
|
22
22
|
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
23
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
24
23
|
from unstructured_ingest.v2.processes.connectors.mongodb import (
|
|
25
24
|
CONNECTOR_TYPE,
|
|
26
25
|
MongoDBAccessConfig,
|
|
@@ -32,6 +31,7 @@ from unstructured_ingest.v2.processes.connectors.mongodb import (
|
|
|
32
31
|
MongoDBUploader,
|
|
33
32
|
MongoDBUploaderConfig,
|
|
34
33
|
)
|
|
34
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
35
35
|
|
|
36
36
|
SOURCE_COLLECTION = "sample-mongodb-data"
|
|
37
37
|
|
|
@@ -13,11 +13,6 @@ from test.integration.connectors.utils.constants import DESTINATION_TAG, GRAPH_D
|
|
|
13
13
|
from test.integration.connectors.utils.docker import container_context
|
|
14
14
|
from unstructured_ingest.error import DestinationConnectionError
|
|
15
15
|
from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
|
|
16
|
-
from unstructured_ingest.v2.interfaces.file_data import (
|
|
17
|
-
FileData,
|
|
18
|
-
FileDataSourceMetadata,
|
|
19
|
-
SourceIdentifiers,
|
|
20
|
-
)
|
|
21
16
|
from unstructured_ingest.v2.processes.connectors.neo4j import (
|
|
22
17
|
CONNECTOR_TYPE,
|
|
23
18
|
Label,
|
|
@@ -28,6 +23,11 @@ from unstructured_ingest.v2.processes.connectors.neo4j import (
|
|
|
28
23
|
Neo4jUploadStager,
|
|
29
24
|
Relationship,
|
|
30
25
|
)
|
|
26
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
27
|
+
FileData,
|
|
28
|
+
FileDataSourceMetadata,
|
|
29
|
+
SourceIdentifiers,
|
|
30
|
+
)
|
|
31
31
|
|
|
32
32
|
USERNAME = "neo4j"
|
|
33
33
|
PASSWORD = "password"
|
|
@@ -15,7 +15,6 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
15
15
|
source_connector_validation,
|
|
16
16
|
)
|
|
17
17
|
from test.integration.utils import requires_env
|
|
18
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
19
18
|
from unstructured_ingest.v2.processes.connectors.onedrive import (
|
|
20
19
|
CONNECTOR_TYPE,
|
|
21
20
|
OnedriveAccessConfig,
|
|
@@ -27,6 +26,7 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
|
|
|
27
26
|
OnedriveUploader,
|
|
28
27
|
OnedriveUploaderConfig,
|
|
29
28
|
)
|
|
29
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
@pytest.fixture
|
|
@@ -19,7 +19,6 @@ from test.integration.connectors.utils.validation.destination import (
|
|
|
19
19
|
)
|
|
20
20
|
from test.integration.utils import requires_env
|
|
21
21
|
from unstructured_ingest.error import DestinationConnectionError
|
|
22
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
23
22
|
from unstructured_ingest.v2.logger import logger
|
|
24
23
|
from unstructured_ingest.v2.processes.connectors.pinecone import (
|
|
25
24
|
CONNECTOR_TYPE,
|
|
@@ -31,6 +30,7 @@ from unstructured_ingest.v2.processes.connectors.pinecone import (
|
|
|
31
30
|
PineconeUploadStager,
|
|
32
31
|
PineconeUploadStagerConfig,
|
|
33
32
|
)
|
|
33
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
34
34
|
|
|
35
35
|
METADATA_BYTES_LIMIT = (
|
|
36
36
|
40960 # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
|
|
@@ -16,7 +16,6 @@ from test.integration.connectors.utils.validation.destination import (
|
|
|
16
16
|
stager_validation,
|
|
17
17
|
)
|
|
18
18
|
from test.integration.utils import requires_env
|
|
19
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
20
19
|
from unstructured_ingest.v2.processes.connectors.qdrant.cloud import (
|
|
21
20
|
CloudQdrantAccessConfig,
|
|
22
21
|
CloudQdrantConnectionConfig,
|
|
@@ -45,6 +44,7 @@ from unstructured_ingest.v2.processes.connectors.qdrant.server import (
|
|
|
45
44
|
ServerQdrantUploadStager,
|
|
46
45
|
ServerQdrantUploadStagerConfig,
|
|
47
46
|
)
|
|
47
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
48
48
|
|
|
49
49
|
COLLECTION_NAME = f"test-coll-{uuid.uuid4().hex[:12]}"
|
|
50
50
|
VECTORS_CONFIG = {"size": 384, "distance": "Cosine"}
|
|
@@ -11,7 +11,6 @@ from redis.asyncio import Redis, from_url
|
|
|
11
11
|
|
|
12
12
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
|
|
13
13
|
from test.integration.utils import requires_env
|
|
14
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
15
14
|
from unstructured_ingest.v2.processes.connectors.redisdb import (
|
|
16
15
|
CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
|
|
17
16
|
)
|
|
@@ -21,6 +20,7 @@ from unstructured_ingest.v2.processes.connectors.redisdb import (
|
|
|
21
20
|
RedisUploader,
|
|
22
21
|
RedisUploaderConfig,
|
|
23
22
|
)
|
|
23
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
async def delete_record(client: Redis, element_id: str, key_prefix: str) -> None:
|
|
@@ -18,7 +18,6 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
18
18
|
)
|
|
19
19
|
from test.integration.utils import requires_env
|
|
20
20
|
from unstructured_ingest.v2.errors import UserAuthError, UserError
|
|
21
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
22
21
|
from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
|
|
23
22
|
CONNECTOR_TYPE,
|
|
24
23
|
S3AccessConfig,
|
|
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
|
|
|
30
29
|
S3Uploader,
|
|
31
30
|
S3UploaderConfig,
|
|
32
31
|
)
|
|
32
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
def validate_predownload_file_data(file_data: FileData):
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import os
|
|
3
3
|
import time
|
|
4
|
+
from functools import lru_cache
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Generator
|
|
6
7
|
from uuid import uuid4
|
|
@@ -10,7 +11,6 @@ import requests
|
|
|
10
11
|
|
|
11
12
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
|
|
12
13
|
from test.integration.utils import requires_env
|
|
13
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
14
14
|
from unstructured_ingest.v2.logger import logger
|
|
15
15
|
from unstructured_ingest.v2.processes.connectors.vectara import (
|
|
16
16
|
CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
|
|
@@ -23,26 +23,32 @@ from unstructured_ingest.v2.processes.connectors.vectara import (
|
|
|
23
23
|
VectaraUploadStager,
|
|
24
24
|
VectaraUploadStagerConfig,
|
|
25
25
|
)
|
|
26
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
26
27
|
|
|
27
28
|
|
|
28
|
-
def validate_upload(
|
|
29
|
+
def validate_upload(document: dict, expected_data: dict):
|
|
30
|
+
logger.info(f"validating document: {document}")
|
|
29
31
|
element_id = expected_data["element_id"]
|
|
30
32
|
expected_text = expected_data["text"]
|
|
31
33
|
filename = expected_data["metadata"]["filename"]
|
|
32
34
|
filetype = expected_data["metadata"]["filetype"]
|
|
33
35
|
page_number = expected_data["metadata"]["page_number"]
|
|
34
36
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
assert
|
|
38
|
-
|
|
39
|
-
assert
|
|
40
|
-
|
|
41
|
-
assert
|
|
42
|
-
assert
|
|
37
|
+
assert document is not None
|
|
38
|
+
speech_parts = document["parts"]
|
|
39
|
+
assert speech_parts
|
|
40
|
+
first_part = speech_parts[0]
|
|
41
|
+
assert first_part["text"] == expected_text
|
|
42
|
+
part_metadata = first_part["metadata"]
|
|
43
|
+
assert part_metadata
|
|
44
|
+
assert part_metadata["element_id"] == element_id
|
|
45
|
+
assert part_metadata["filename"] == filename
|
|
46
|
+
assert part_metadata["filetype"] == filetype
|
|
47
|
+
assert part_metadata["page_number"] == page_number
|
|
43
48
|
|
|
44
49
|
|
|
45
50
|
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
|
|
51
|
+
@lru_cache()
|
|
46
52
|
def _get_jwt_token():
|
|
47
53
|
"""Connect to the server and get a JWT token."""
|
|
48
54
|
customer_id = os.environ["VECTARA_CUSTOMER_ID"]
|
|
@@ -65,23 +71,12 @@ def _get_jwt_token():
|
|
|
65
71
|
return response_json.get("access_token")
|
|
66
72
|
|
|
67
73
|
|
|
68
|
-
def
|
|
74
|
+
def list_documents(corpus_key: str) -> list[str]:
|
|
69
75
|
|
|
70
|
-
url = f"https://api.vectara.io/v2/corpora/{corpus_key}/
|
|
76
|
+
url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents"
|
|
71
77
|
|
|
72
78
|
# the query below requires the corpus to have filter attributes for element_id
|
|
73
79
|
|
|
74
|
-
data = json.dumps(
|
|
75
|
-
{
|
|
76
|
-
"query": "string",
|
|
77
|
-
"search": {
|
|
78
|
-
"metadata_filter": f"part.element_id = '{element_id}'",
|
|
79
|
-
"lexical_interpolation": 1,
|
|
80
|
-
"limit": 10,
|
|
81
|
-
},
|
|
82
|
-
}
|
|
83
|
-
)
|
|
84
|
-
|
|
85
80
|
jwt_token = _get_jwt_token()
|
|
86
81
|
headers = {
|
|
87
82
|
"Content-Type": "application/json",
|
|
@@ -90,11 +85,26 @@ def query_data(corpus_key: str, element_id: str) -> dict:
|
|
|
90
85
|
"X-source": "unstructured",
|
|
91
86
|
}
|
|
92
87
|
|
|
93
|
-
response = requests.
|
|
88
|
+
response = requests.get(url, headers=headers)
|
|
94
89
|
response.raise_for_status()
|
|
95
90
|
response_json = response.json()
|
|
91
|
+
documents = response_json.get("documents", [])
|
|
92
|
+
return documents
|
|
93
|
+
|
|
96
94
|
|
|
97
|
-
|
|
95
|
+
def fetch_document(corpus_key: str, documents_id: str) -> dict:
|
|
96
|
+
url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents/{documents_id}"
|
|
97
|
+
jwt_token = _get_jwt_token()
|
|
98
|
+
headers = {
|
|
99
|
+
"Content-Type": "application/json",
|
|
100
|
+
"Accept": "application/json",
|
|
101
|
+
"Authorization": f"Bearer {jwt_token}",
|
|
102
|
+
"X-source": "unstructured",
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
response = requests.get(url, headers=headers)
|
|
106
|
+
response.raise_for_status()
|
|
107
|
+
return response.json()
|
|
98
108
|
|
|
99
109
|
|
|
100
110
|
def create_corpora(corpus_key: str, corpus_name: str) -> None:
|
|
@@ -148,8 +158,8 @@ def delete_corpora(corpus_key: str) -> None:
|
|
|
148
158
|
response.raise_for_status()
|
|
149
159
|
|
|
150
160
|
|
|
151
|
-
def
|
|
152
|
-
url = "https://api.vectara.io/v2/corpora
|
|
161
|
+
def get_metadata(corpus_key: str):
|
|
162
|
+
url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
|
|
153
163
|
jwt_token = _get_jwt_token()
|
|
154
164
|
headers = {
|
|
155
165
|
"Content-Type": "application/json",
|
|
@@ -159,35 +169,28 @@ def list_corpora() -> list:
|
|
|
159
169
|
}
|
|
160
170
|
response = requests.get(url, headers=headers)
|
|
161
171
|
response.raise_for_status()
|
|
162
|
-
|
|
163
|
-
if response_json.get("corpora"):
|
|
164
|
-
return [item["key"] for item in response_json.get("corpora")]
|
|
165
|
-
else:
|
|
166
|
-
return []
|
|
172
|
+
return response.json()
|
|
167
173
|
|
|
168
174
|
|
|
169
175
|
def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
|
|
170
|
-
def is_ready_status():
|
|
171
|
-
corpora_list = list_corpora()
|
|
172
|
-
return corpus_key in corpora_list
|
|
173
|
-
|
|
174
176
|
start = time.time()
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
177
|
+
while time.time() - start < timeout:
|
|
178
|
+
try:
|
|
179
|
+
get_metadata(corpus_key)
|
|
180
|
+
return
|
|
181
|
+
except requests.HTTPError:
|
|
182
|
+
time.sleep(interval)
|
|
183
|
+
raise TimeoutError("time out waiting for corpus to be ready")
|
|
181
184
|
|
|
182
185
|
|
|
183
186
|
def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
|
|
184
187
|
start = time.time()
|
|
185
188
|
while time.time() - start < timeout:
|
|
186
|
-
|
|
187
|
-
|
|
189
|
+
try:
|
|
190
|
+
get_metadata(corpus_key)
|
|
191
|
+
time.sleep(interval)
|
|
192
|
+
except requests.HTTPError:
|
|
188
193
|
return
|
|
189
|
-
time.sleep(interval)
|
|
190
|
-
|
|
191
194
|
raise TimeoutError("time out waiting for corpus to delete")
|
|
192
195
|
|
|
193
196
|
|
|
@@ -210,11 +213,23 @@ def corpora_util() -> Generator[str, None, None]:
|
|
|
210
213
|
wait_for_delete(corpus_key=corpus_key)
|
|
211
214
|
|
|
212
215
|
|
|
216
|
+
def wait_for_doc_meta(corpus_key: str, timeout=60, interval=1) -> list[str]:
|
|
217
|
+
start = time.time()
|
|
218
|
+
while time.time() - start < timeout:
|
|
219
|
+
all_document_meta = list_documents(corpus_key)
|
|
220
|
+
if not all_document_meta:
|
|
221
|
+
time.sleep(interval)
|
|
222
|
+
continue
|
|
223
|
+
else:
|
|
224
|
+
return all_document_meta
|
|
225
|
+
raise TimeoutError("time out waiting for document to be ready")
|
|
226
|
+
|
|
227
|
+
|
|
213
228
|
@pytest.mark.asyncio
|
|
214
229
|
@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
|
|
215
230
|
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
|
|
216
231
|
async def test_vectara_destination(
|
|
217
|
-
upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=
|
|
232
|
+
upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=1
|
|
218
233
|
):
|
|
219
234
|
corpus_key = corpora_util
|
|
220
235
|
connection_kwargs = {
|
|
@@ -231,7 +246,7 @@ async def test_vectara_destination(
|
|
|
231
246
|
identifier="mock-file-data",
|
|
232
247
|
)
|
|
233
248
|
|
|
234
|
-
stager_config = VectaraUploadStagerConfig(
|
|
249
|
+
stager_config = VectaraUploadStagerConfig()
|
|
235
250
|
stager = VectaraUploadStager(upload_stager_config=stager_config)
|
|
236
251
|
new_upload_file = stager.run(
|
|
237
252
|
elements_filepath=upload_file,
|
|
@@ -260,11 +275,8 @@ async def test_vectara_destination(
|
|
|
260
275
|
elements = json.load(upload_fp)
|
|
261
276
|
first_element = elements[0]
|
|
262
277
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
break
|
|
269
|
-
|
|
270
|
-
validate_upload(response=response, expected_data=first_element)
|
|
278
|
+
all_document_meta = wait_for_doc_meta(corpus_key)
|
|
279
|
+
assert len(all_document_meta) == 1
|
|
280
|
+
document_meta = all_document_meta[0]
|
|
281
|
+
document = fetch_document(corpus_key=corpus_key, documents_id=document_meta["id"])
|
|
282
|
+
validate_upload(document=document, expected_data=first_element)
|
|
@@ -4,7 +4,8 @@ from pathlib import Path
|
|
|
4
4
|
|
|
5
5
|
from test.integration.connectors.utils.validation.utils import ValidationConfig
|
|
6
6
|
from unstructured_ingest.utils.data_prep import get_data
|
|
7
|
-
from unstructured_ingest.v2.interfaces import
|
|
7
|
+
from unstructured_ingest.v2.interfaces import UploadStager
|
|
8
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
class StagerValidationConfigs(ValidationConfig):
|
|
@@ -8,7 +8,8 @@ from deepdiff import DeepDiff
|
|
|
8
8
|
from pydantic import Field
|
|
9
9
|
|
|
10
10
|
from test.integration.connectors.utils.validation.utils import ValidationConfig
|
|
11
|
-
from unstructured_ingest.v2.interfaces import Downloader,
|
|
11
|
+
from unstructured_ingest.v2.interfaces import Downloader, Indexer
|
|
12
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
12
13
|
|
|
13
14
|
NONSTANDARD_METADATA_FIELDS = {
|
|
14
15
|
"additional_metadata.@microsoft.graph.downloadUrl": [
|
|
@@ -9,7 +9,6 @@ from weaviate.client import WeaviateClient
|
|
|
9
9
|
|
|
10
10
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
|
|
11
11
|
from test.integration.connectors.utils.docker import container_context
|
|
12
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
13
12
|
from unstructured_ingest.v2.processes.connectors.weaviate.local import (
|
|
14
13
|
CONNECTOR_TYPE,
|
|
15
14
|
LocalWeaviateConnectionConfig,
|
|
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.processes.connectors.weaviate.local import (
|
|
|
17
16
|
LocalWeaviateUploaderConfig,
|
|
18
17
|
LocalWeaviateUploadStager,
|
|
19
18
|
)
|
|
19
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
20
20
|
|
|
21
21
|
COLLECTION_NAME = "elements"
|
|
22
22
|
|
test/unit/test_html.py
CHANGED
|
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
|
|
|
5
5
|
from pytest_mock import MockerFixture
|
|
6
6
|
|
|
7
7
|
from unstructured_ingest.utils.html import HtmlMixin
|
|
8
|
-
from unstructured_ingest.v2.
|
|
8
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def test_extract_images(mocker: MockerFixture):
|
|
@@ -8,7 +8,6 @@ from pyiceberg.exceptions import CommitFailedException
|
|
|
8
8
|
from pytest_mock import MockerFixture
|
|
9
9
|
|
|
10
10
|
from unstructured_ingest.v2.errors import ProviderError, UserError
|
|
11
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
12
11
|
from unstructured_ingest.v2.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
|
|
13
12
|
from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
|
|
14
13
|
IbmWatsonxAccessConfig,
|
|
@@ -16,6 +15,7 @@ from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 impo
|
|
|
16
15
|
IbmWatsonxUploader,
|
|
17
16
|
IbmWatsonxUploaderConfig,
|
|
18
17
|
)
|
|
18
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
@pytest.fixture
|