unstructured-ingest: unstructured_ingest-0.5.23-py3-none-any.whl → unstructured_ingest-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (105):
  1. test/integration/connectors/databricks/test_volumes_native.py +1 -1
  2. test/integration/connectors/duckdb/test_duckdb.py +1 -1
  3. test/integration/connectors/duckdb/test_motherduck.py +1 -1
  4. test/integration/connectors/elasticsearch/test_elasticsearch.py +1 -1
  5. test/integration/connectors/elasticsearch/test_opensearch.py +1 -1
  6. test/integration/connectors/sql/test_databricks_delta_tables.py +1 -1
  7. test/integration/connectors/sql/test_postgres.py +1 -1
  8. test/integration/connectors/sql/test_singlestore.py +1 -1
  9. test/integration/connectors/sql/test_snowflake.py +1 -1
  10. test/integration/connectors/sql/test_sqlite.py +1 -1
  11. test/integration/connectors/test_astradb.py +1 -1
  12. test/integration/connectors/test_azure_ai_search.py +1 -1
  13. test/integration/connectors/test_chroma.py +1 -1
  14. test/integration/connectors/test_delta_table.py +1 -1
  15. test/integration/connectors/test_lancedb.py +1 -1
  16. test/integration/connectors/test_milvus.py +1 -1
  17. test/integration/connectors/test_mongodb.py +1 -1
  18. test/integration/connectors/test_neo4j.py +5 -5
  19. test/integration/connectors/test_onedrive.py +1 -1
  20. test/integration/connectors/test_pinecone.py +1 -1
  21. test/integration/connectors/test_qdrant.py +1 -1
  22. test/integration/connectors/test_redis.py +1 -1
  23. test/integration/connectors/test_s3.py +1 -1
  24. test/integration/connectors/test_vectara.py +68 -56
  25. test/integration/connectors/utils/validation/destination.py +2 -1
  26. test/integration/connectors/utils/validation/source.py +2 -1
  27. test/integration/connectors/weaviate/test_local.py +1 -1
  28. test/unit/test_html.py +1 -1
  29. test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +1 -1
  30. test/unit/v2/connectors/motherduck/test_base.py +1 -2
  31. test/unit/v2/connectors/sql/test_sql.py +1 -1
  32. unstructured_ingest/__version__.py +1 -1
  33. unstructured_ingest/utils/html.py +2 -1
  34. unstructured_ingest/v2/interfaces/__init__.py +0 -13
  35. unstructured_ingest/v2/interfaces/downloader.py +1 -1
  36. unstructured_ingest/v2/interfaces/indexer.py +1 -1
  37. unstructured_ingest/v2/interfaces/upload_stager.py +2 -2
  38. unstructured_ingest/v2/interfaces/uploader.py +2 -3
  39. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -2
  40. unstructured_ingest/v2/pipeline/steps/download.py +2 -3
  41. unstructured_ingest/v2/pipeline/steps/embed.py +1 -2
  42. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  43. unstructured_ingest/v2/pipeline/steps/partition.py +1 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  46. unstructured_ingest/v2/pipeline/steps/upload.py +2 -2
  47. unstructured_ingest/v2/processes/connectors/airtable.py +1 -2
  48. unstructured_ingest/v2/processes/connectors/astradb.py +7 -5
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +1 -1
  50. unstructured_ingest/v2/processes/connectors/chroma.py +1 -1
  51. unstructured_ingest/v2/processes/connectors/confluence.py +5 -3
  52. unstructured_ingest/v2/processes/connectors/couchbase.py +7 -5
  53. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -3
  54. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +64 -19
  55. unstructured_ingest/v2/processes/connectors/delta_table.py +1 -1
  56. unstructured_ingest/v2/processes/connectors/discord.py +5 -3
  57. unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -1
  58. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +1 -1
  59. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +1 -1
  60. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -5
  61. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -1
  62. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +5 -3
  65. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -1
  66. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -4
  67. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +1 -1
  68. unstructured_ingest/v2/processes/connectors/gitlab.py +5 -3
  69. unstructured_ingest/v2/processes/connectors/google_drive.py +5 -3
  70. unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +1 -1
  71. unstructured_ingest/v2/processes/connectors/jira.py +5 -3
  72. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +5 -3
  73. unstructured_ingest/v2/processes/connectors/kdbai.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -4
  75. unstructured_ingest/v2/processes/connectors/local.py +5 -3
  76. unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
  77. unstructured_ingest/v2/processes/connectors/mongodb.py +7 -5
  78. unstructured_ingest/v2/processes/connectors/neo4j.py +1 -1
  79. unstructured_ingest/v2/processes/connectors/notion/connector.py +5 -3
  80. unstructured_ingest/v2/processes/connectors/onedrive.py +5 -3
  81. unstructured_ingest/v2/processes/connectors/outlook.py +5 -2
  82. unstructured_ingest/v2/processes/connectors/pinecone.py +1 -1
  83. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +1 -1
  84. unstructured_ingest/v2/processes/connectors/redisdb.py +1 -1
  85. unstructured_ingest/v2/processes/connectors/salesforce.py +5 -3
  86. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -3
  87. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  88. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +1 -1
  89. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
  90. unstructured_ingest/v2/processes/connectors/sql/sql.py +13 -8
  91. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +3 -3
  92. unstructured_ingest/v2/processes/connectors/vectara.py +1 -1
  93. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +1 -1
  94. unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +5 -3
  95. unstructured_ingest/v2/processes/filter.py +1 -1
  96. unstructured_ingest/v2/processes/uncompress.py +1 -1
  97. unstructured_ingest/v2/processes/utils/blob_storage.py +2 -1
  98. unstructured_ingest/v2/utils.py +1 -1
  99. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/METADATA +101 -101
  100. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/RECORD +104 -105
  101. unstructured_ingest/v2/interfaces/file_data.py +0 -13
  102. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/LICENSE.md +0 -0
  103. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/WHEEL +0 -0
  104. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/entry_points.txt +0 -0
  105. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/top_level.txt +0 -0
@@ -21,7 +21,6 @@ from test.integration.connectors.utils.validation.source import (
21
21
  )
22
22
  from test.integration.utils import requires_env
23
23
  from unstructured_ingest.v2.errors import UserAuthError, UserError
24
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
25
24
  from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
26
25
  CONNECTOR_TYPE,
27
26
  DatabricksNativeVolumesAccessConfig,
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native impor
33
32
  DatabricksNativeVolumesUploader,
34
33
  DatabricksNativeVolumesUploaderConfig,
35
34
  )
35
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
36
36
 
37
37
 
38
38
  @dataclass
@@ -10,7 +10,6 @@ from test.integration.connectors.utils.validation.destination import (
10
10
  StagerValidationConfigs,
11
11
  stager_validation,
12
12
  )
13
- from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
14
13
  from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
15
14
  CONNECTOR_TYPE,
16
15
  DuckDBConnectionConfig,
@@ -18,6 +17,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
18
17
  DuckDBUploaderConfig,
19
18
  DuckDBUploadStager,
20
19
  )
20
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
21
21
 
22
22
 
23
23
  @pytest.fixture
@@ -9,7 +9,6 @@ import pytest
9
9
 
10
10
  from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
11
11
  from test.integration.utils import requires_env
12
- from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
13
12
  from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
14
13
  CONNECTOR_TYPE,
15
14
  MotherDuckAccessConfig,
@@ -18,6 +17,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
18
17
  MotherDuckUploaderConfig,
19
18
  MotherDuckUploadStager,
20
19
  )
20
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
21
21
 
22
22
 
23
23
  @pytest.fixture
@@ -22,7 +22,7 @@ from test.integration.connectors.utils.validation.source import (
22
22
  source_connector_validation,
23
23
  )
24
24
  from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
25
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
25
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
26
26
  from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
27
27
  CONNECTOR_TYPE,
28
28
  ElasticsearchAccessConfig,
@@ -24,7 +24,6 @@ from unstructured_ingest.error import (
24
24
  DestinationConnectionError,
25
25
  SourceConnectionError,
26
26
  )
27
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
28
27
  from unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch import (
29
28
  CONNECTOR_TYPE,
30
29
  OpenSearchAccessConfig,
@@ -38,6 +37,7 @@ from unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch import
38
37
  OpenSearchUploadStager,
39
38
  OpenSearchUploadStagerConfig,
40
39
  )
40
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
41
41
 
42
42
  SOURCE_INDEX_NAME = "movies"
43
43
  DESTINATION_INDEX_NAME = "elements"
@@ -14,7 +14,6 @@ from pytest_mock import MockerFixture
14
14
 
15
15
  from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG, env_setup_path
16
16
  from test.integration.utils import requires_env
17
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
18
17
  from unstructured_ingest.v2.logger import logger
19
18
  from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
20
19
  CONNECTOR_TYPE,
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables imp
24
23
  DatabricksDeltaTablesUploaderConfig,
25
24
  DatabricksDeltaTablesUploadStager,
26
25
  )
26
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
27
27
 
28
28
  CATALOG = "utic-dev-tech-fixtures"
29
29
 
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
20
20
  SourceValidationConfigs,
21
21
  source_connector_validation,
22
22
  )
23
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
24
23
  from unstructured_ingest.v2.processes.connectors.sql.postgres import (
25
24
  CONNECTOR_TYPE,
26
25
  PostgresAccessConfig,
@@ -32,6 +31,7 @@ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
32
31
  PostgresUploader,
33
32
  PostgresUploadStager,
34
33
  )
34
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
35
35
 
36
36
  SEED_DATA_ROWS = 10
37
37
 
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
20
20
  SourceValidationConfigs,
21
21
  source_connector_validation,
22
22
  )
23
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
24
23
  from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
25
24
  CONNECTOR_TYPE,
26
25
  SingleStoreAccessConfig,
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
33
32
  SingleStoreUploaderConfig,
34
33
  SingleStoreUploadStager,
35
34
  )
35
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
36
36
 
37
37
  SEED_DATA_ROWS = 10
38
38
 
@@ -22,7 +22,6 @@ from test.integration.connectors.utils.validation.source import (
22
22
  source_connector_validation,
23
23
  )
24
24
  from test.integration.utils import requires_env
25
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
26
25
  from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
27
26
  CONNECTOR_TYPE,
28
27
  SnowflakeAccessConfig,
@@ -34,6 +33,7 @@ from unstructured_ingest.v2.processes.connectors.sql.snowflake import (
34
33
  SnowflakeUploader,
35
34
  SnowflakeUploadStager,
36
35
  )
36
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
37
37
 
38
38
  SEED_DATA_ROWS = 20
39
39
 
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
20
20
  SourceValidationConfigs,
21
21
  source_connector_validation,
22
22
  )
23
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
24
23
  from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
25
24
  CONNECTOR_TYPE,
26
25
  SQLiteConnectionConfig,
@@ -31,6 +30,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
31
30
  SQLiteUploader,
32
31
  SQLiteUploadStager,
33
32
  )
33
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
34
34
 
35
35
  SEED_DATA_ROWS = 10
36
36
 
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
20
20
  source_connector_validation,
21
21
  )
22
22
  from test.integration.utils import requires_env
23
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
24
23
  from unstructured_ingest.v2.processes.connectors.astradb import (
25
24
  CONNECTOR_TYPE,
26
25
  AstraDBAccessConfig,
@@ -36,6 +35,7 @@ from unstructured_ingest.v2.processes.connectors.astradb import (
36
35
  DestinationConnectionError,
37
36
  SourceConnectionError,
38
37
  )
38
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
39
39
 
40
40
  EXISTENT_COLLECTION_NAME = "ingest_test_src"
41
41
  NONEXISTENT_COLLECTION_NAME = "nonexistant"
@@ -29,7 +29,6 @@ from test.integration.connectors.utils.validation.destination import (
29
29
  stager_validation,
30
30
  )
31
31
  from test.integration.utils import requires_env
32
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
33
32
  from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
34
33
  CONNECTOR_TYPE,
35
34
  RECORD_ID_LABEL,
@@ -40,6 +39,7 @@ from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
40
39
  AzureAISearchUploadStager,
41
40
  AzureAISearchUploadStagerConfig,
42
41
  )
42
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
43
43
 
44
44
  repo_path = Path(__file__).parent.resolve()
45
45
 
@@ -27,7 +27,7 @@ from test.integration.connectors.utils.validation.destination import (
27
27
  StagerValidationConfigs,
28
28
  stager_validation,
29
29
  )
30
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
30
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
31
31
  from unstructured_ingest.v2.processes.connectors.chroma import (
32
32
  CONNECTOR_TYPE,
33
33
  ChromaConnectionConfig,
@@ -8,7 +8,6 @@ from fsspec import get_filesystem_class
8
8
 
9
9
  from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
10
10
  from test.integration.utils import requires_env
11
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
12
11
  from unstructured_ingest.v2.processes.connectors.delta_table import (
13
12
  CONNECTOR_TYPE,
14
13
  DeltaTableAccessConfig,
@@ -18,6 +17,7 @@ from unstructured_ingest.v2.processes.connectors.delta_table import (
18
17
  DeltaTableUploadStager,
19
18
  DeltaTableUploadStagerConfig,
20
19
  )
20
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
21
21
 
22
22
  multiprocessing.set_start_method("spawn")
23
23
 
@@ -13,7 +13,6 @@ from upath import UPath
13
13
 
14
14
  from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
15
15
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
16
- from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
17
16
  from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
18
17
  LanceDBAwsAccessConfig,
19
18
  LanceDBAwsConnectionConfig,
@@ -39,6 +38,7 @@ from unstructured_ingest.v2.processes.connectors.lancedb.local import (
39
38
  LanceDBLocalConnectionConfig,
40
39
  LanceDBLocalUploader,
41
40
  )
41
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
42
42
 
43
43
  DATABASE_NAME = "database"
44
44
  TABLE_NAME = "elements"
@@ -25,7 +25,6 @@ from test.integration.connectors.utils.validation.destination import (
25
25
  stager_validation,
26
26
  )
27
27
  from unstructured_ingest.error import DestinationConnectionError
28
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
29
28
  from unstructured_ingest.v2.processes.connectors.milvus import (
30
29
  CONNECTOR_TYPE,
31
30
  MilvusConnectionConfig,
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connectors.milvus import (
33
32
  MilvusUploaderConfig,
34
33
  MilvusUploadStager,
35
34
  )
35
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
36
36
 
37
37
  DB_NAME = "test_database"
38
38
  EXISTENT_COLLECTION_NAME = "test_collection"
@@ -20,7 +20,6 @@ from test.integration.connectors.utils.validation.source import (
20
20
  )
21
21
  from test.integration.utils import requires_env
22
22
  from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
23
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
24
23
  from unstructured_ingest.v2.processes.connectors.mongodb import (
25
24
  CONNECTOR_TYPE,
26
25
  MongoDBAccessConfig,
@@ -32,6 +31,7 @@ from unstructured_ingest.v2.processes.connectors.mongodb import (
32
31
  MongoDBUploader,
33
32
  MongoDBUploaderConfig,
34
33
  )
34
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
35
35
 
36
36
  SOURCE_COLLECTION = "sample-mongodb-data"
37
37
 
@@ -13,11 +13,6 @@ from test.integration.connectors.utils.constants import DESTINATION_TAG, GRAPH_D
13
13
  from test.integration.connectors.utils.docker import container_context
14
14
  from unstructured_ingest.error import DestinationConnectionError
15
15
  from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
16
- from unstructured_ingest.v2.interfaces.file_data import (
17
- FileData,
18
- FileDataSourceMetadata,
19
- SourceIdentifiers,
20
- )
21
16
  from unstructured_ingest.v2.processes.connectors.neo4j import (
22
17
  CONNECTOR_TYPE,
23
18
  Label,
@@ -28,6 +23,11 @@ from unstructured_ingest.v2.processes.connectors.neo4j import (
28
23
  Neo4jUploadStager,
29
24
  Relationship,
30
25
  )
26
+ from unstructured_ingest.v2.types.file_data import (
27
+ FileData,
28
+ FileDataSourceMetadata,
29
+ SourceIdentifiers,
30
+ )
31
31
 
32
32
  USERNAME = "neo4j"
33
33
  PASSWORD = "password"
@@ -15,7 +15,6 @@ from test.integration.connectors.utils.validation.source import (
15
15
  source_connector_validation,
16
16
  )
17
17
  from test.integration.utils import requires_env
18
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
19
18
  from unstructured_ingest.v2.processes.connectors.onedrive import (
20
19
  CONNECTOR_TYPE,
21
20
  OnedriveAccessConfig,
@@ -27,6 +26,7 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
27
26
  OnedriveUploader,
28
27
  OnedriveUploaderConfig,
29
28
  )
29
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
30
30
 
31
31
 
32
32
  @pytest.fixture
@@ -19,7 +19,6 @@ from test.integration.connectors.utils.validation.destination import (
19
19
  )
20
20
  from test.integration.utils import requires_env
21
21
  from unstructured_ingest.error import DestinationConnectionError
22
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
23
22
  from unstructured_ingest.v2.logger import logger
24
23
  from unstructured_ingest.v2.processes.connectors.pinecone import (
25
24
  CONNECTOR_TYPE,
@@ -31,6 +30,7 @@ from unstructured_ingest.v2.processes.connectors.pinecone import (
31
30
  PineconeUploadStager,
32
31
  PineconeUploadStagerConfig,
33
32
  )
33
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
34
34
 
35
35
  METADATA_BYTES_LIMIT = (
36
36
  40960 # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
@@ -16,7 +16,6 @@ from test.integration.connectors.utils.validation.destination import (
16
16
  stager_validation,
17
17
  )
18
18
  from test.integration.utils import requires_env
19
- from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
20
19
  from unstructured_ingest.v2.processes.connectors.qdrant.cloud import (
21
20
  CloudQdrantAccessConfig,
22
21
  CloudQdrantConnectionConfig,
@@ -45,6 +44,7 @@ from unstructured_ingest.v2.processes.connectors.qdrant.server import (
45
44
  ServerQdrantUploadStager,
46
45
  ServerQdrantUploadStagerConfig,
47
46
  )
47
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
48
48
 
49
49
  COLLECTION_NAME = f"test-coll-{uuid.uuid4().hex[:12]}"
50
50
  VECTORS_CONFIG = {"size": 384, "distance": "Cosine"}
@@ -11,7 +11,6 @@ from redis.asyncio import Redis, from_url
11
11
 
12
12
  from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
13
13
  from test.integration.utils import requires_env
14
- from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
15
14
  from unstructured_ingest.v2.processes.connectors.redisdb import (
16
15
  CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE,
17
16
  )
@@ -21,6 +20,7 @@ from unstructured_ingest.v2.processes.connectors.redisdb import (
21
20
  RedisUploader,
22
21
  RedisUploaderConfig,
23
22
  )
23
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
24
24
 
25
25
 
26
26
  async def delete_record(client: Redis, element_id: str, key_prefix: str) -> None:
@@ -18,7 +18,6 @@ from test.integration.connectors.utils.validation.source import (
18
18
  )
19
19
  from test.integration.utils import requires_env
20
20
  from unstructured_ingest.v2.errors import UserAuthError, UserError
21
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
22
21
  from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
23
22
  CONNECTOR_TYPE,
24
23
  S3AccessConfig,
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
30
29
  S3Uploader,
31
30
  S3UploaderConfig,
32
31
  )
32
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
33
33
 
34
34
 
35
35
  def validate_predownload_file_data(file_data: FileData):
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import os
3
3
  import time
4
+ from functools import lru_cache
4
5
  from pathlib import Path
5
6
  from typing import Generator
6
7
  from uuid import uuid4
@@ -10,7 +11,6 @@ import requests
10
11
 
11
12
  from test.integration.connectors.utils.constants import DESTINATION_TAG, NOSQL_TAG
12
13
  from test.integration.utils import requires_env
13
- from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
14
14
  from unstructured_ingest.v2.logger import logger
15
15
  from unstructured_ingest.v2.processes.connectors.vectara import (
16
16
  CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
@@ -23,26 +23,32 @@ from unstructured_ingest.v2.processes.connectors.vectara import (
23
23
  VectaraUploadStager,
24
24
  VectaraUploadStagerConfig,
25
25
  )
26
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
26
27
 
27
28
 
28
- def validate_upload(response: dict, expected_data: dict):
29
+ def validate_upload(document: dict, expected_data: dict):
30
+ logger.info(f"validating document: {document}")
29
31
  element_id = expected_data["element_id"]
30
32
  expected_text = expected_data["text"]
31
33
  filename = expected_data["metadata"]["filename"]
32
34
  filetype = expected_data["metadata"]["filetype"]
33
35
  page_number = expected_data["metadata"]["page_number"]
34
36
 
35
- response = response["search_results"][0]
36
-
37
- assert response is not None
38
- assert response["text"] == expected_text
39
- assert response["part_metadata"]["element_id"] == element_id
40
- assert response["part_metadata"]["filename"] == filename
41
- assert response["part_metadata"]["filetype"] == filetype
42
- assert response["part_metadata"]["page_number"] == page_number
37
+ assert document is not None
38
+ speech_parts = document["parts"]
39
+ assert speech_parts
40
+ first_part = speech_parts[0]
41
+ assert first_part["text"] == expected_text
42
+ part_metadata = first_part["metadata"]
43
+ assert part_metadata
44
+ assert part_metadata["element_id"] == element_id
45
+ assert part_metadata["filename"] == filename
46
+ assert part_metadata["filetype"] == filetype
47
+ assert part_metadata["page_number"] == page_number
43
48
 
44
49
 
45
50
  @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
51
+ @lru_cache()
46
52
  def _get_jwt_token():
47
53
  """Connect to the server and get a JWT token."""
48
54
  customer_id = os.environ["VECTARA_CUSTOMER_ID"]
@@ -65,23 +71,12 @@ def _get_jwt_token():
65
71
  return response_json.get("access_token")
66
72
 
67
73
 
68
- def query_data(corpus_key: str, element_id: str) -> dict:
74
+ def list_documents(corpus_key: str) -> list[str]:
69
75
 
70
- url = f"https://api.vectara.io/v2/corpora/{corpus_key}/query"
76
+ url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents"
71
77
 
72
78
  # the query below requires the corpus to have filter attributes for element_id
73
79
 
74
- data = json.dumps(
75
- {
76
- "query": "string",
77
- "search": {
78
- "metadata_filter": f"part.element_id = '{element_id}'",
79
- "lexical_interpolation": 1,
80
- "limit": 10,
81
- },
82
- }
83
- )
84
-
85
80
  jwt_token = _get_jwt_token()
86
81
  headers = {
87
82
  "Content-Type": "application/json",
@@ -90,11 +85,26 @@ def query_data(corpus_key: str, element_id: str) -> dict:
90
85
  "X-source": "unstructured",
91
86
  }
92
87
 
93
- response = requests.post(url, headers=headers, data=data)
88
+ response = requests.get(url, headers=headers)
94
89
  response.raise_for_status()
95
90
  response_json = response.json()
91
+ documents = response_json.get("documents", [])
92
+ return documents
93
+
96
94
 
97
- return response_json
95
+ def fetch_document(corpus_key: str, documents_id: str) -> dict:
96
+ url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents/{documents_id}"
97
+ jwt_token = _get_jwt_token()
98
+ headers = {
99
+ "Content-Type": "application/json",
100
+ "Accept": "application/json",
101
+ "Authorization": f"Bearer {jwt_token}",
102
+ "X-source": "unstructured",
103
+ }
104
+
105
+ response = requests.get(url, headers=headers)
106
+ response.raise_for_status()
107
+ return response.json()
98
108
 
99
109
 
100
110
  def create_corpora(corpus_key: str, corpus_name: str) -> None:
@@ -148,8 +158,8 @@ def delete_corpora(corpus_key: str) -> None:
148
158
  response.raise_for_status()
149
159
 
150
160
 
151
- def list_corpora() -> list:
152
- url = "https://api.vectara.io/v2/corpora?limit=100"
161
+ def get_metadata(corpus_key: str):
162
+ url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
153
163
  jwt_token = _get_jwt_token()
154
164
  headers = {
155
165
  "Content-Type": "application/json",
@@ -159,35 +169,28 @@ def list_corpora() -> list:
159
169
  }
160
170
  response = requests.get(url, headers=headers)
161
171
  response.raise_for_status()
162
- response_json = response.json()
163
- if response_json.get("corpora"):
164
- return [item["key"] for item in response_json.get("corpora")]
165
- else:
166
- return []
172
+ return response.json()
167
173
 
168
174
 
169
175
  def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
170
- def is_ready_status():
171
- corpora_list = list_corpora()
172
- return corpus_key in corpora_list
173
-
174
176
  start = time.time()
175
- is_ready = is_ready_status()
176
- while not is_ready and time.time() - start < timeout:
177
- time.sleep(interval)
178
- is_ready = is_ready_status()
179
- if not is_ready:
180
- raise TimeoutError("time out waiting for corpus to be ready")
177
+ while time.time() - start < timeout:
178
+ try:
179
+ get_metadata(corpus_key)
180
+ return
181
+ except requests.HTTPError:
182
+ time.sleep(interval)
183
+ raise TimeoutError("time out waiting for corpus to be ready")
181
184
 
182
185
 
183
186
  def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
184
187
  start = time.time()
185
188
  while time.time() - start < timeout:
186
- corpora_list = list_corpora()
187
- if corpus_key not in corpora_list:
189
+ try:
190
+ get_metadata(corpus_key)
191
+ time.sleep(interval)
192
+ except requests.HTTPError:
188
193
  return
189
- time.sleep(interval)
190
-
191
194
  raise TimeoutError("time out waiting for corpus to delete")
192
195
 
193
196
 
@@ -210,11 +213,23 @@ def corpora_util() -> Generator[str, None, None]:
210
213
  wait_for_delete(corpus_key=corpus_key)
211
214
 
212
215
 
216
+ def wait_for_doc_meta(corpus_key: str, timeout=60, interval=1) -> list[str]:
217
+ start = time.time()
218
+ while time.time() - start < timeout:
219
+ all_document_meta = list_documents(corpus_key)
220
+ if not all_document_meta:
221
+ time.sleep(interval)
222
+ continue
223
+ else:
224
+ return all_document_meta
225
+ raise TimeoutError("time out waiting for document to be ready")
226
+
227
+
213
228
  @pytest.mark.asyncio
214
229
  @pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
215
230
  @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
216
231
  async def test_vectara_destination(
217
- upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
232
+ upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=1
218
233
  ):
219
234
  corpus_key = corpora_util
220
235
  connection_kwargs = {
@@ -231,7 +246,7 @@ async def test_vectara_destination(
231
246
  identifier="mock-file-data",
232
247
  )
233
248
 
234
- stager_config = VectaraUploadStagerConfig(batch_size=10)
249
+ stager_config = VectaraUploadStagerConfig()
235
250
  stager = VectaraUploadStager(upload_stager_config=stager_config)
236
251
  new_upload_file = stager.run(
237
252
  elements_filepath=upload_file,
@@ -260,11 +275,8 @@ async def test_vectara_destination(
260
275
  elements = json.load(upload_fp)
261
276
  first_element = elements[0]
262
277
 
263
- for i in range(retries):
264
- response = query_data(corpus_key, first_element["element_id"])
265
- if not response["search_results"]:
266
- time.sleep(interval)
267
- else:
268
- break
269
-
270
- validate_upload(response=response, expected_data=first_element)
278
+ all_document_meta = wait_for_doc_meta(corpus_key)
279
+ assert len(all_document_meta) == 1
280
+ document_meta = all_document_meta[0]
281
+ document = fetch_document(corpus_key=corpus_key, documents_id=document_meta["id"])
282
+ validate_upload(document=document, expected_data=first_element)
@@ -4,7 +4,8 @@ from pathlib import Path
4
4
 
5
5
  from test.integration.connectors.utils.validation.utils import ValidationConfig
6
6
  from unstructured_ingest.utils.data_prep import get_data
7
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers, UploadStager
7
+ from unstructured_ingest.v2.interfaces import UploadStager
8
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
8
9
 
9
10
 
10
11
  class StagerValidationConfigs(ValidationConfig):
@@ -8,7 +8,8 @@ from deepdiff import DeepDiff
8
8
  from pydantic import Field
9
9
 
10
10
  from test.integration.connectors.utils.validation.utils import ValidationConfig
11
- from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
11
+ from unstructured_ingest.v2.interfaces import Downloader, Indexer
12
+ from unstructured_ingest.v2.types.file_data import FileData
12
13
 
13
14
  NONSTANDARD_METADATA_FIELDS = {
14
15
  "additional_metadata.@microsoft.graph.downloadUrl": [
@@ -9,7 +9,6 @@ from weaviate.client import WeaviateClient
9
9
 
10
10
  from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
11
11
  from test.integration.connectors.utils.docker import container_context
12
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
13
12
  from unstructured_ingest.v2.processes.connectors.weaviate.local import (
14
13
  CONNECTOR_TYPE,
15
14
  LocalWeaviateConnectionConfig,
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.processes.connectors.weaviate.local import (
17
16
  LocalWeaviateUploaderConfig,
18
17
  LocalWeaviateUploadStager,
19
18
  )
19
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
20
20
 
21
21
  COLLECTION_NAME = "elements"
22
22
 
test/unit/test_html.py CHANGED
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
5
5
  from pytest_mock import MockerFixture
6
6
 
7
7
  from unstructured_ingest.utils.html import HtmlMixin
8
- from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
8
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
9
9
 
10
10
 
11
11
  def test_extract_images(mocker: MockerFixture):
@@ -8,7 +8,6 @@ from pyiceberg.exceptions import CommitFailedException
8
8
  from pytest_mock import MockerFixture
9
9
 
10
10
  from unstructured_ingest.v2.errors import ProviderError, UserError
11
- from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
12
11
  from unstructured_ingest.v2.processes.connectors.ibm_watsonx import IBM_WATSONX_S3_CONNECTOR_TYPE
13
12
  from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
14
13
  IbmWatsonxAccessConfig,
@@ -16,6 +15,7 @@ from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 impo
16
15
  IbmWatsonxUploader,
17
16
  IbmWatsonxUploaderConfig,
18
17
  )
18
+ from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
19
19
 
20
20
 
21
21
  @pytest.fixture