unstructured-ingest 0.5.23__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (105)
  1. test/integration/connectors/databricks/test_volumes_native.py +1 -1
  2. test/integration/connectors/duckdb/test_duckdb.py +1 -1
  3. test/integration/connectors/duckdb/test_motherduck.py +1 -1
  4. test/integration/connectors/elasticsearch/test_elasticsearch.py +1 -1
  5. test/integration/connectors/elasticsearch/test_opensearch.py +1 -1
  6. test/integration/connectors/sql/test_databricks_delta_tables.py +1 -1
  7. test/integration/connectors/sql/test_postgres.py +1 -1
  8. test/integration/connectors/sql/test_singlestore.py +1 -1
  9. test/integration/connectors/sql/test_snowflake.py +1 -1
  10. test/integration/connectors/sql/test_sqlite.py +1 -1
  11. test/integration/connectors/test_astradb.py +1 -1
  12. test/integration/connectors/test_azure_ai_search.py +1 -1
  13. test/integration/connectors/test_chroma.py +1 -1
  14. test/integration/connectors/test_delta_table.py +1 -1
  15. test/integration/connectors/test_lancedb.py +1 -1
  16. test/integration/connectors/test_milvus.py +1 -1
  17. test/integration/connectors/test_mongodb.py +1 -1
  18. test/integration/connectors/test_neo4j.py +5 -5
  19. test/integration/connectors/test_onedrive.py +1 -1
  20. test/integration/connectors/test_pinecone.py +1 -1
  21. test/integration/connectors/test_qdrant.py +1 -1
  22. test/integration/connectors/test_redis.py +1 -1
  23. test/integration/connectors/test_s3.py +1 -1
  24. test/integration/connectors/test_vectara.py +68 -56
  25. test/integration/connectors/utils/validation/destination.py +2 -1
  26. test/integration/connectors/utils/validation/source.py +2 -1
  27. test/integration/connectors/weaviate/test_local.py +1 -1
  28. test/unit/test_html.py +1 -1
  29. test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +1 -1
  30. test/unit/v2/connectors/motherduck/test_base.py +1 -2
  31. test/unit/v2/connectors/sql/test_sql.py +1 -1
  32. unstructured_ingest/__version__.py +1 -1
  33. unstructured_ingest/utils/html.py +2 -1
  34. unstructured_ingest/v2/interfaces/__init__.py +0 -13
  35. unstructured_ingest/v2/interfaces/downloader.py +1 -1
  36. unstructured_ingest/v2/interfaces/indexer.py +1 -1
  37. unstructured_ingest/v2/interfaces/upload_stager.py +2 -2
  38. unstructured_ingest/v2/interfaces/uploader.py +2 -3
  39. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -2
  40. unstructured_ingest/v2/pipeline/steps/download.py +2 -3
  41. unstructured_ingest/v2/pipeline/steps/embed.py +1 -2
  42. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  43. unstructured_ingest/v2/pipeline/steps/partition.py +1 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  46. unstructured_ingest/v2/pipeline/steps/upload.py +2 -2
  47. unstructured_ingest/v2/processes/connectors/airtable.py +1 -2
  48. unstructured_ingest/v2/processes/connectors/astradb.py +7 -5
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +1 -1
  50. unstructured_ingest/v2/processes/connectors/chroma.py +1 -1
  51. unstructured_ingest/v2/processes/connectors/confluence.py +5 -3
  52. unstructured_ingest/v2/processes/connectors/couchbase.py +7 -5
  53. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -3
  54. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +64 -19
  55. unstructured_ingest/v2/processes/connectors/delta_table.py +1 -1
  56. unstructured_ingest/v2/processes/connectors/discord.py +5 -3
  57. unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -1
  58. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +1 -1
  59. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +1 -1
  60. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -5
  61. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -1
  62. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +5 -3
  65. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -1
  66. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -4
  67. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +1 -1
  68. unstructured_ingest/v2/processes/connectors/gitlab.py +5 -3
  69. unstructured_ingest/v2/processes/connectors/google_drive.py +5 -3
  70. unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +1 -1
  71. unstructured_ingest/v2/processes/connectors/jira.py +5 -3
  72. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +5 -3
  73. unstructured_ingest/v2/processes/connectors/kdbai.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -4
  75. unstructured_ingest/v2/processes/connectors/local.py +5 -3
  76. unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
  77. unstructured_ingest/v2/processes/connectors/mongodb.py +7 -5
  78. unstructured_ingest/v2/processes/connectors/neo4j.py +1 -1
  79. unstructured_ingest/v2/processes/connectors/notion/connector.py +5 -3
  80. unstructured_ingest/v2/processes/connectors/onedrive.py +5 -3
  81. unstructured_ingest/v2/processes/connectors/outlook.py +5 -2
  82. unstructured_ingest/v2/processes/connectors/pinecone.py +1 -1
  83. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +1 -1
  84. unstructured_ingest/v2/processes/connectors/redisdb.py +1 -1
  85. unstructured_ingest/v2/processes/connectors/salesforce.py +5 -3
  86. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -3
  87. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  88. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +1 -1
  89. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
  90. unstructured_ingest/v2/processes/connectors/sql/sql.py +13 -8
  91. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +3 -3
  92. unstructured_ingest/v2/processes/connectors/vectara.py +1 -1
  93. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +1 -1
  94. unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +5 -3
  95. unstructured_ingest/v2/processes/filter.py +1 -1
  96. unstructured_ingest/v2/processes/uncompress.py +1 -1
  97. unstructured_ingest/v2/processes/utils/blob_storage.py +2 -1
  98. unstructured_ingest/v2/utils.py +1 -1
  99. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/METADATA +101 -101
  100. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/RECORD +104 -105
  101. unstructured_ingest/v2/interfaces/file_data.py +0 -13
  102. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/LICENSE.md +0 -0
  103. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/WHEEL +0 -0
  104. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/entry_points.txt +0 -0
  105. {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,6 @@ from pydantic.functional_validators import BeforeValidator
11
11
 
12
12
  from unstructured_ingest.utils.dep_check import requires_dependencies
13
13
  from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
14
- from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
15
14
  from unstructured_ingest.v2.logger import logger
16
15
  from unstructured_ingest.v2.processes.connector_registry import (
17
16
  DestinationRegistryEntry,
@@ -32,6 +31,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
32
31
  BlobStoreUploadStager,
33
32
  BlobStoreUploadStagerConfig,
34
33
  )
34
+ from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
35
35
 
36
36
  if TYPE_CHECKING:
37
37
  from boxfs import BoxFileSystem
@@ -15,7 +15,6 @@ from unstructured_ingest.v2.errors import (
15
15
  from unstructured_ingest.v2.errors import (
16
16
  RateLimitError as CustomRateLimitError,
17
17
  )
18
- from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
19
18
  from unstructured_ingest.v2.logger import logger
20
19
  from unstructured_ingest.v2.processes.connector_registry import (
21
20
  DestinationRegistryEntry,
@@ -35,6 +34,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
35
34
  BlobStoreUploadStager,
36
35
  BlobStoreUploadStagerConfig,
37
36
  )
37
+ from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
38
38
 
39
39
  if TYPE_CHECKING:
40
40
  pass
@@ -18,16 +18,18 @@ from unstructured_ingest.v2.interfaces import (
18
18
  Downloader,
19
19
  DownloaderConfig,
20
20
  DownloadResponse,
21
- FileData,
22
- FileDataSourceMetadata,
23
21
  Indexer,
24
22
  IndexerConfig,
25
- SourceIdentifiers,
26
23
  Uploader,
27
24
  UploaderConfig,
28
25
  )
29
26
  from unstructured_ingest.v2.logger import logger
30
27
  from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
28
+ from unstructured_ingest.v2.types.file_data import (
29
+ FileData,
30
+ FileDataSourceMetadata,
31
+ SourceIdentifiers,
32
+ )
31
33
 
32
34
  if TYPE_CHECKING:
33
35
  from fsspec import AbstractFileSystem
@@ -12,7 +12,6 @@ from pydantic import Field, Secret
12
12
  from unstructured_ingest.utils.dep_check import requires_dependencies
13
13
  from unstructured_ingest.utils.string_and_date_utils import json_to_dict
14
14
  from unstructured_ingest.v2.errors import ProviderError, UserError
15
- from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
16
15
  from unstructured_ingest.v2.logger import logger
17
16
  from unstructured_ingest.v2.processes.connector_registry import (
18
17
  DestinationRegistryEntry,
@@ -32,6 +31,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
32
31
  BlobStoreUploadStager,
33
32
  BlobStoreUploadStagerConfig,
34
33
  )
34
+ from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
35
35
 
36
36
  if TYPE_CHECKING:
37
37
  from gcsfs import GCSFileSystem
@@ -8,9 +8,6 @@ from pydantic import Field, Secret
8
8
 
9
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
10
10
  from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
11
- from unstructured_ingest.v2.interfaces import (
12
- FileDataSourceMetadata,
13
- )
14
11
  from unstructured_ingest.v2.logger import logger
15
12
  from unstructured_ingest.v2.processes.connector_registry import (
16
13
  DestinationRegistryEntry,
@@ -30,9 +27,15 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
30
27
  BlobStoreUploadStager,
31
28
  BlobStoreUploadStagerConfig,
32
29
  )
30
+ from unstructured_ingest.v2.types.file_data import (
31
+ FileDataSourceMetadata,
32
+ )
33
33
 
34
34
  CONNECTOR_TYPE = "s3"
35
35
 
36
+ # https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines-avoid-characters
37
+ CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]
38
+
36
39
  if TYPE_CHECKING:
37
40
  from s3fs import S3FileSystem
38
41
 
@@ -91,7 +94,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
91
94
  if isinstance(e, PermissionError):
92
95
  return UserAuthError(e)
93
96
  if isinstance(e, FileNotFoundError):
94
- return UserError(e)
97
+ return UserError(f"File not found: {e}")
95
98
  if cause := getattr(e, "__cause__", None):
96
99
  error_response = cause.response
97
100
  error_meta = error_response["ResponseMetadata"]
@@ -140,6 +143,12 @@ class S3Indexer(FsspecIndexer):
140
143
  }
141
144
  if metadata:
142
145
  record_locator["metadata"] = metadata
146
+ issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
147
+ if issue_characters:
148
+ logger.warning(
149
+ f"File path {path} contains characters "
150
+ f"that can cause issues with S3: {issue_characters}"
151
+ )
143
152
  return FileDataSourceMetadata(
144
153
  date_created=date_created,
145
154
  date_modified=date_modified,
@@ -11,7 +11,6 @@ from urllib.parse import urlparse
11
11
  from pydantic import Field, Secret
12
12
 
13
13
  from unstructured_ingest.utils.dep_check import requires_dependencies
14
- from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata
15
14
  from unstructured_ingest.v2.processes.connector_registry import (
16
15
  DestinationRegistryEntry,
17
16
  SourceRegistryEntry,
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
30
29
  BlobStoreUploadStager,
31
30
  BlobStoreUploadStagerConfig,
32
31
  )
32
+ from unstructured_ingest.v2.types.file_data import FileData, FileDataSourceMetadata
33
33
 
34
34
  if TYPE_CHECKING:
35
35
  from fsspec.implementations.sftp import SFTPFileSystem
@@ -16,14 +16,16 @@ from unstructured_ingest.v2.interfaces import (
16
16
  Downloader,
17
17
  DownloaderConfig,
18
18
  DownloadResponse,
19
- FileData,
20
- FileDataSourceMetadata,
21
19
  Indexer,
22
20
  IndexerConfig,
23
- SourceIdentifiers,
24
21
  )
25
22
  from unstructured_ingest.v2.logger import logger
26
23
  from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
24
+ from unstructured_ingest.v2.types.file_data import (
25
+ FileData,
26
+ FileDataSourceMetadata,
27
+ SourceIdentifiers,
28
+ )
27
29
 
28
30
  CONNECTOR_TYPE = "gitlab"
29
31
  if TYPE_CHECKING:
@@ -21,15 +21,17 @@ from unstructured_ingest.v2.interfaces import (
21
21
  Downloader,
22
22
  DownloaderConfig,
23
23
  DownloadResponse,
24
- FileData,
25
- FileDataSourceMetadata,
26
24
  Indexer,
27
25
  IndexerConfig,
28
- SourceIdentifiers,
29
26
  )
30
27
  from unstructured_ingest.v2.logger import logger
31
28
  from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
32
29
  from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
30
+ from unstructured_ingest.v2.types.file_data import (
31
+ FileData,
32
+ FileDataSourceMetadata,
33
+ SourceIdentifiers,
34
+ )
33
35
 
34
36
  CONNECTOR_TYPE = "google_drive"
35
37
 
@@ -15,7 +15,6 @@ from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserErro
15
15
  from unstructured_ingest.v2.interfaces import (
16
16
  AccessConfig,
17
17
  ConnectionConfig,
18
- FileData,
19
18
  UploaderConfig,
20
19
  )
21
20
  from unstructured_ingest.v2.logger import logger
@@ -27,6 +26,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
27
26
  SQLUploadStager,
28
27
  SQLUploadStagerConfig,
29
28
  )
29
+ from unstructured_ingest.v2.types.file_data import FileData
30
30
 
31
31
  if TYPE_CHECKING:
32
32
  from pyarrow import Table as ArrowTable
@@ -15,16 +15,18 @@ from unstructured_ingest.v2.interfaces import (
15
15
  Downloader,
16
16
  DownloaderConfig,
17
17
  DownloadResponse,
18
- FileData,
19
- FileDataSourceMetadata,
20
18
  Indexer,
21
19
  IndexerConfig,
22
- SourceIdentifiers,
23
20
  )
24
21
  from unstructured_ingest.v2.logger import logger
25
22
  from unstructured_ingest.v2.processes.connector_registry import (
26
23
  SourceRegistryEntry,
27
24
  )
25
+ from unstructured_ingest.v2.types.file_data import (
26
+ FileData,
27
+ FileDataSourceMetadata,
28
+ SourceIdentifiers,
29
+ )
28
30
 
29
31
  if TYPE_CHECKING:
30
32
  from atlassian import Jira
@@ -21,15 +21,17 @@ from unstructured_ingest.v2.interfaces import (
21
21
  Downloader,
22
22
  DownloaderConfig,
23
23
  DownloadResponse,
24
- FileData,
25
- FileDataSourceMetadata,
26
24
  Indexer,
27
25
  IndexerConfig,
28
- SourceIdentifiers,
29
26
  Uploader,
30
27
  UploaderConfig,
31
28
  )
32
29
  from unstructured_ingest.v2.logger import logger
30
+ from unstructured_ingest.v2.types.file_data import (
31
+ FileData,
32
+ FileDataSourceMetadata,
33
+ SourceIdentifiers,
34
+ )
33
35
 
34
36
  if TYPE_CHECKING:
35
37
  from confluent_kafka import Consumer, Producer
@@ -11,7 +11,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
11
11
  from unstructured_ingest.v2.interfaces import (
12
12
  AccessConfig,
13
13
  ConnectionConfig,
14
- FileData,
15
14
  Uploader,
16
15
  UploaderConfig,
17
16
  UploadStager,
@@ -21,6 +20,7 @@ from unstructured_ingest.v2.logger import logger
21
20
  from unstructured_ingest.v2.processes.connector_registry import (
22
21
  DestinationRegistryEntry,
23
22
  )
23
+ from unstructured_ingest.v2.types.file_data import FileData
24
24
  from unstructured_ingest.v2.utils import get_enhanced_element_id
25
25
 
26
26
  if TYPE_CHECKING:
@@ -15,10 +15,14 @@ from unstructured_ingest.logger import logger
15
15
  from unstructured_ingest.utils.data_prep import flatten_dict
16
16
  from unstructured_ingest.utils.dep_check import requires_dependencies
17
17
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
18
- from unstructured_ingest.v2.interfaces.connector import ConnectionConfig
19
- from unstructured_ingest.v2.interfaces.file_data import FileData
20
- from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig
21
- from unstructured_ingest.v2.interfaces.uploader import Uploader, UploaderConfig
18
+ from unstructured_ingest.v2.interfaces import (
19
+ ConnectionConfig,
20
+ Uploader,
21
+ UploaderConfig,
22
+ UploadStager,
23
+ UploadStagerConfig,
24
+ )
25
+ from unstructured_ingest.v2.types.file_data import FileData
22
26
 
23
27
  CONNECTOR_TYPE = "lancedb"
24
28
 
@@ -14,11 +14,8 @@ from unstructured_ingest.v2.interfaces import (
14
14
  Downloader,
15
15
  DownloaderConfig,
16
16
  DownloadResponse,
17
- FileData,
18
- FileDataSourceMetadata,
19
17
  Indexer,
20
18
  IndexerConfig,
21
- SourceIdentifiers,
22
19
  Uploader,
23
20
  UploaderConfig,
24
21
  )
@@ -31,6 +28,11 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
31
28
  BlobStoreUploadStager,
32
29
  BlobStoreUploadStagerConfig,
33
30
  )
31
+ from unstructured_ingest.v2.types.file_data import (
32
+ FileData,
33
+ FileDataSourceMetadata,
34
+ SourceIdentifiers,
35
+ )
34
36
 
35
37
  CONNECTOR_TYPE = "local"
36
38
 
@@ -13,7 +13,6 @@ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
13
13
  from unstructured_ingest.v2.interfaces import (
14
14
  AccessConfig,
15
15
  ConnectionConfig,
16
- FileData,
17
16
  Uploader,
18
17
  UploaderConfig,
19
18
  UploadStager,
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.logger import logger
23
22
  from unstructured_ingest.v2.processes.connector_registry import (
24
23
  DestinationRegistryEntry,
25
24
  )
25
+ from unstructured_ingest.v2.types.file_data import FileData
26
26
 
27
27
  if TYPE_CHECKING:
28
28
  from pymilvus import MilvusClient
@@ -13,17 +13,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
13
13
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
14
14
  from unstructured_ingest.v2.interfaces import (
15
15
  AccessConfig,
16
- BatchFileData,
17
- BatchItem,
18
16
  ConnectionConfig,
19
17
  Downloader,
20
18
  DownloaderConfig,
21
19
  DownloadResponse,
22
- FileData,
23
- FileDataSourceMetadata,
24
20
  Indexer,
25
21
  IndexerConfig,
26
- SourceIdentifiers,
27
22
  Uploader,
28
23
  UploaderConfig,
29
24
  download_responses,
@@ -33,6 +28,13 @@ from unstructured_ingest.v2.processes.connector_registry import (
33
28
  DestinationRegistryEntry,
34
29
  SourceRegistryEntry,
35
30
  )
31
+ from unstructured_ingest.v2.types.file_data import (
32
+ BatchFileData,
33
+ BatchItem,
34
+ FileData,
35
+ FileDataSourceMetadata,
36
+ SourceIdentifiers,
37
+ )
36
38
 
37
39
  if TYPE_CHECKING:
38
40
  from pymongo import MongoClient
@@ -18,7 +18,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
18
18
  from unstructured_ingest.v2.interfaces import (
19
19
  AccessConfig,
20
20
  ConnectionConfig,
21
- FileData,
22
21
  Uploader,
23
22
  UploaderConfig,
24
23
  UploadStager,
@@ -28,6 +27,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
28
27
  DestinationRegistryEntry,
29
28
  )
30
29
  from unstructured_ingest.v2.processes.connectors.utils import format_and_truncate_orig_elements
30
+ from unstructured_ingest.v2.types.file_data import FileData
31
31
 
32
32
  SimilarityFunction = Literal["cosine"]
33
33
 
@@ -12,14 +12,16 @@ from unstructured_ingest.v2.interfaces import (
12
12
  Downloader,
13
13
  DownloaderConfig,
14
14
  DownloadResponse,
15
- FileData,
16
- FileDataSourceMetadata,
17
15
  Indexer,
18
16
  IndexerConfig,
19
- SourceIdentifiers,
20
17
  )
21
18
  from unstructured_ingest.v2.logger import logger
22
19
  from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
20
+ from unstructured_ingest.v2.types.file_data import (
21
+ FileData,
22
+ FileDataSourceMetadata,
23
+ SourceIdentifiers,
24
+ )
23
25
 
24
26
  if TYPE_CHECKING:
25
27
  from unstructured_ingest.v2.processes.connectors.notion.client import Client
@@ -22,11 +22,8 @@ from unstructured_ingest.v2.interfaces import (
22
22
  Downloader,
23
23
  DownloaderConfig,
24
24
  DownloadResponse,
25
- FileData,
26
- FileDataSourceMetadata,
27
25
  Indexer,
28
26
  IndexerConfig,
29
- SourceIdentifiers,
30
27
  Uploader,
31
28
  UploaderConfig,
32
29
  )
@@ -39,6 +36,11 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
39
36
  BlobStoreUploadStager,
40
37
  BlobStoreUploadStagerConfig,
41
38
  )
39
+ from unstructured_ingest.v2.types.file_data import (
40
+ FileData,
41
+ FileDataSourceMetadata,
42
+ SourceIdentifiers,
43
+ )
42
44
 
43
45
  if TYPE_CHECKING:
44
46
  from office365.graph_client import GraphClient
@@ -16,12 +16,15 @@ from unstructured_ingest.v2.interfaces import (
16
16
  Downloader,
17
17
  DownloaderConfig,
18
18
  DownloadResponse,
19
- FileData,
20
19
  Indexer,
21
20
  IndexerConfig,
22
21
  )
23
- from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
24
22
  from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
23
+ from unstructured_ingest.v2.types.file_data import (
24
+ FileData,
25
+ FileDataSourceMetadata,
26
+ SourceIdentifiers,
27
+ )
25
28
 
26
29
  MAX_EMAILS_PER_FOLDER = 1_000_000 # Maximum number of emails per folder
27
30
 
@@ -13,7 +13,6 @@ from unstructured_ingest.v2.errors import UserError
13
13
  from unstructured_ingest.v2.interfaces import (
14
14
  AccessConfig,
15
15
  ConnectionConfig,
16
- FileData,
17
16
  UploaderConfig,
18
17
  UploadStager,
19
18
  UploadStagerConfig,
@@ -21,6 +20,7 @@ from unstructured_ingest.v2.interfaces import (
21
20
  )
22
21
  from unstructured_ingest.v2.logger import logger
23
22
  from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
23
+ from unstructured_ingest.v2.types.file_data import FileData
24
24
  from unstructured_ingest.v2.utils import get_enhanced_element_id
25
25
 
26
26
  if TYPE_CHECKING:
@@ -13,13 +13,13 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
13
13
  from unstructured_ingest.v2.interfaces import (
14
14
  AccessConfig,
15
15
  ConnectionConfig,
16
- FileData,
17
16
  Uploader,
18
17
  UploaderConfig,
19
18
  UploadStager,
20
19
  UploadStagerConfig,
21
20
  )
22
21
  from unstructured_ingest.v2.logger import logger
22
+ from unstructured_ingest.v2.types.file_data import FileData
23
23
  from unstructured_ingest.v2.utils import get_enhanced_element_id
24
24
 
25
25
  if TYPE_CHECKING:
@@ -11,12 +11,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
11
11
  from unstructured_ingest.v2.interfaces import (
12
12
  AccessConfig,
13
13
  ConnectionConfig,
14
- FileData,
15
14
  Uploader,
16
15
  UploaderConfig,
17
16
  )
18
17
  from unstructured_ingest.v2.logger import logger
19
18
  from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
19
+ from unstructured_ingest.v2.types.file_data import FileData
20
20
 
21
21
  if TYPE_CHECKING:
22
22
  from redis.asyncio import Redis
@@ -28,16 +28,18 @@ from unstructured_ingest.v2.interfaces import (
28
28
  Downloader,
29
29
  DownloaderConfig,
30
30
  DownloadResponse,
31
- FileData,
32
- FileDataSourceMetadata,
33
31
  Indexer,
34
32
  IndexerConfig,
35
- SourceIdentifiers,
36
33
  )
37
34
  from unstructured_ingest.v2.logger import logger
38
35
  from unstructured_ingest.v2.processes.connector_registry import (
39
36
  SourceRegistryEntry,
40
37
  )
38
+ from unstructured_ingest.v2.types.file_data import (
39
+ FileData,
40
+ FileDataSourceMetadata,
41
+ SourceIdentifiers,
42
+ )
41
43
 
42
44
 
43
45
  class MissingCategoryError(Exception):
@@ -11,9 +11,6 @@ from unstructured_ingest.error import (
11
11
  SourceConnectionNetworkError,
12
12
  )
13
13
  from unstructured_ingest.utils.dep_check import requires_dependencies
14
- from unstructured_ingest.v2.interfaces import (
15
- FileData,
16
- )
17
14
  from unstructured_ingest.v2.logger import logger
18
15
  from unstructured_ingest.v2.processes.connector_registry import (
19
16
  SourceRegistryEntry,
@@ -26,6 +23,9 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
26
23
  OnedriveIndexer,
27
24
  OnedriveIndexerConfig,
28
25
  )
26
+ from unstructured_ingest.v2.types.file_data import (
27
+ FileData,
28
+ )
29
29
 
30
30
  if TYPE_CHECKING:
31
31
  from office365.onedrive.driveitems.driveItem import DriveItem
@@ -20,12 +20,12 @@ from unstructured_ingest.v2.interfaces import (
20
20
  Indexer,
21
21
  IndexerConfig,
22
22
  )
23
- from unstructured_ingest.v2.interfaces.file_data import (
23
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
24
+ from unstructured_ingest.v2.types.file_data import (
24
25
  FileData,
25
26
  FileDataSourceMetadata,
26
27
  SourceIdentifiers,
27
28
  )
28
- from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
29
29
 
30
30
  if TYPE_CHECKING:
31
31
  from slack_sdk import WebClient
@@ -7,7 +7,6 @@ from pydantic import Field, Secret
7
7
 
8
8
  from unstructured_ingest.utils.data_prep import split_dataframe
9
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
10
- from unstructured_ingest.v2.interfaces import FileData
11
10
  from unstructured_ingest.v2.logger import logger
12
11
  from unstructured_ingest.v2.processes.connector_registry import (
13
12
  DestinationRegistryEntry,
@@ -20,6 +19,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
20
19
  SQLUploadStager,
21
20
  SQLUploadStagerConfig,
22
21
  )
22
+ from unstructured_ingest.v2.types.file_data import FileData
23
23
 
24
24
  if TYPE_CHECKING:
25
25
  from databricks.sdk.core import oauth_service_principal
@@ -7,7 +7,6 @@ from pydantic import Field, Secret
7
7
 
8
8
  from unstructured_ingest.utils.data_prep import split_dataframe
9
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
10
- from unstructured_ingest.v2.interfaces.file_data import FileData
11
10
  from unstructured_ingest.v2.logger import logger
12
11
  from unstructured_ingest.v2.processes.connector_registry import (
13
12
  DestinationRegistryEntry,
@@ -28,6 +27,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
28
27
  SQLUploadStagerConfig,
29
28
  parse_date_string,
30
29
  )
30
+ from unstructured_ingest.v2.types.file_data import FileData
31
31
 
32
32
  if TYPE_CHECKING:
33
33
  from pandas import DataFrame
@@ -16,17 +16,12 @@ from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dat
16
16
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
17
17
  from unstructured_ingest.v2.interfaces import (
18
18
  AccessConfig,
19
- BatchFileData,
20
- BatchItem,
21
19
  ConnectionConfig,
22
20
  Downloader,
23
21
  DownloaderConfig,
24
22
  DownloadResponse,
25
- FileData,
26
- FileDataSourceMetadata,
27
23
  Indexer,
28
24
  IndexerConfig,
29
- SourceIdentifiers,
30
25
  Uploader,
31
26
  UploaderConfig,
32
27
  UploadStager,
@@ -34,6 +29,13 @@ from unstructured_ingest.v2.interfaces import (
34
29
  download_responses,
35
30
  )
36
31
  from unstructured_ingest.v2.logger import logger
32
+ from unstructured_ingest.v2.types.file_data import (
33
+ BatchFileData,
34
+ BatchItem,
35
+ FileData,
36
+ FileDataSourceMetadata,
37
+ SourceIdentifiers,
38
+ )
37
39
  from unstructured_ingest.v2.utils import get_enhanced_element_id
38
40
 
39
41
  if TYPE_CHECKING:
@@ -251,8 +253,9 @@ class SQLUploadStager(UploadStager):
251
253
  df[column] = df[column].apply(str)
252
254
  return df
253
255
 
254
- def write_output(self, output_path: Path, data: list[dict]) -> None:
256
+ def write_output(self, output_path: Path, data: list[dict]) -> Path:
255
257
  write_data(path=output_path, data=data)
258
+ return output_path
256
259
 
257
260
  def run(
258
261
  self,
@@ -278,8 +281,10 @@ class SQLUploadStager(UploadStager):
278
281
  output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
279
282
  output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
280
283
 
281
- self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
282
- return output_path
284
+ final_output_path = self.write_output(
285
+ output_path=output_path, data=df.to_dict(orient="records")
286
+ )
287
+ return final_output_path
283
288
 
284
289
 
285
290
  class SQLUploaderConfig(UploaderConfig):
@@ -8,9 +8,6 @@ from unstructured_ingest.error import DestinationConnectionError
8
8
  from unstructured_ingest.utils.data_prep import split_dataframe
9
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
10
10
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
11
- from unstructured_ingest.v2.interfaces import (
12
- FileData,
13
- )
14
11
  from unstructured_ingest.v2.logger import logger
15
12
  from unstructured_ingest.v2.processes.connector_registry import (
16
13
  DestinationRegistryEntry,
@@ -29,6 +26,9 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
29
26
  SQLUploadStager,
30
27
  SQLUploadStagerConfig,
31
28
  )
29
+ from unstructured_ingest.v2.types.file_data import (
30
+ FileData,
31
+ )
32
32
  from unstructured_ingest.v2.utils import get_enhanced_element_id
33
33
 
34
34
  if TYPE_CHECKING:
@@ -14,7 +14,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
14
14
  from unstructured_ingest.v2.interfaces import (
15
15
  AccessConfig,
16
16
  ConnectionConfig,
17
- FileData,
18
17
  Uploader,
19
18
  UploaderConfig,
20
19
  UploadStager,
@@ -22,6 +21,7 @@ from unstructured_ingest.v2.interfaces import (
22
21
  )
23
22
  from unstructured_ingest.v2.logger import logger
24
23
  from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
24
+ from unstructured_ingest.v2.types.file_data import FileData
25
25
 
26
26
  BASE_URL = "https://api.vectara.io/v2"
27
27