unstructured-ingest 0.5.23__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/databricks/test_volumes_native.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +1 -1
- test/integration/connectors/duckdb/test_motherduck.py +1 -1
- test/integration/connectors/elasticsearch/test_elasticsearch.py +1 -1
- test/integration/connectors/elasticsearch/test_opensearch.py +1 -1
- test/integration/connectors/sql/test_databricks_delta_tables.py +1 -1
- test/integration/connectors/sql/test_postgres.py +1 -1
- test/integration/connectors/sql/test_singlestore.py +1 -1
- test/integration/connectors/sql/test_snowflake.py +1 -1
- test/integration/connectors/sql/test_sqlite.py +1 -1
- test/integration/connectors/test_astradb.py +1 -1
- test/integration/connectors/test_azure_ai_search.py +1 -1
- test/integration/connectors/test_chroma.py +1 -1
- test/integration/connectors/test_delta_table.py +1 -1
- test/integration/connectors/test_lancedb.py +1 -1
- test/integration/connectors/test_milvus.py +1 -1
- test/integration/connectors/test_mongodb.py +1 -1
- test/integration/connectors/test_neo4j.py +5 -5
- test/integration/connectors/test_onedrive.py +1 -1
- test/integration/connectors/test_pinecone.py +1 -1
- test/integration/connectors/test_qdrant.py +1 -1
- test/integration/connectors/test_redis.py +1 -1
- test/integration/connectors/test_s3.py +1 -1
- test/integration/connectors/test_vectara.py +68 -56
- test/integration/connectors/utils/validation/destination.py +2 -1
- test/integration/connectors/utils/validation/source.py +2 -1
- test/integration/connectors/weaviate/test_local.py +1 -1
- test/unit/test_html.py +1 -1
- test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +1 -1
- test/unit/v2/connectors/motherduck/test_base.py +1 -2
- test/unit/v2/connectors/sql/test_sql.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/html.py +2 -1
- unstructured_ingest/v2/interfaces/__init__.py +0 -13
- unstructured_ingest/v2/interfaces/downloader.py +1 -1
- unstructured_ingest/v2/interfaces/indexer.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +2 -2
- unstructured_ingest/v2/interfaces/uploader.py +2 -3
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -2
- unstructured_ingest/v2/pipeline/steps/download.py +2 -3
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -2
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -2
- unstructured_ingest/v2/processes/connectors/airtable.py +1 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +7 -5
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/v2/processes/connectors/chroma.py +1 -1
- unstructured_ingest/v2/processes/connectors/confluence.py +5 -3
- unstructured_ingest/v2/processes/connectors/couchbase.py +7 -5
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +64 -19
- unstructured_ingest/v2/processes/connectors/delta_table.py +1 -1
- unstructured_ingest/v2/processes/connectors/discord.py +5 -3
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -1
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +1 -1
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +1 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -5
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +5 -3
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -4
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +1 -1
- unstructured_ingest/v2/processes/connectors/gitlab.py +5 -3
- unstructured_ingest/v2/processes/connectors/google_drive.py +5 -3
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +1 -1
- unstructured_ingest/v2/processes/connectors/jira.py +5 -3
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +5 -3
- unstructured_ingest/v2/processes/connectors/kdbai.py +1 -1
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -4
- unstructured_ingest/v2/processes/connectors/local.py +5 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/mongodb.py +7 -5
- unstructured_ingest/v2/processes/connectors/neo4j.py +1 -1
- unstructured_ingest/v2/processes/connectors/notion/connector.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +5 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +5 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +1 -1
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +1 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +1 -1
- unstructured_ingest/v2/processes/connectors/salesforce.py +5 -3
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -3
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +13 -8
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +3 -3
- unstructured_ingest/v2/processes/connectors/vectara.py +1 -1
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +1 -1
- unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +5 -3
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/uncompress.py +1 -1
- unstructured_ingest/v2/processes/utils/blob_storage.py +2 -1
- unstructured_ingest/v2/utils.py +1 -1
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/METADATA +101 -101
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/RECORD +104 -105
- unstructured_ingest/v2/interfaces/file_data.py +0 -13
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -11,7 +11,6 @@ from pydantic.functional_validators import BeforeValidator
|
|
|
11
11
|
|
|
12
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
13
|
from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
|
|
14
|
-
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
15
14
|
from unstructured_ingest.v2.logger import logger
|
|
16
15
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
17
16
|
DestinationRegistryEntry,
|
|
@@ -32,6 +31,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
|
32
31
|
BlobStoreUploadStager,
|
|
33
32
|
BlobStoreUploadStagerConfig,
|
|
34
33
|
)
|
|
34
|
+
from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
|
|
35
35
|
|
|
36
36
|
if TYPE_CHECKING:
|
|
37
37
|
from boxfs import BoxFileSystem
|
|
@@ -15,7 +15,6 @@ from unstructured_ingest.v2.errors import (
|
|
|
15
15
|
from unstructured_ingest.v2.errors import (
|
|
16
16
|
RateLimitError as CustomRateLimitError,
|
|
17
17
|
)
|
|
18
|
-
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
19
18
|
from unstructured_ingest.v2.logger import logger
|
|
20
19
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
21
20
|
DestinationRegistryEntry,
|
|
@@ -35,6 +34,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
|
35
34
|
BlobStoreUploadStager,
|
|
36
35
|
BlobStoreUploadStagerConfig,
|
|
37
36
|
)
|
|
37
|
+
from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
|
|
38
38
|
|
|
39
39
|
if TYPE_CHECKING:
|
|
40
40
|
pass
|
|
@@ -18,16 +18,18 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
18
18
|
Downloader,
|
|
19
19
|
DownloaderConfig,
|
|
20
20
|
DownloadResponse,
|
|
21
|
-
FileData,
|
|
22
|
-
FileDataSourceMetadata,
|
|
23
21
|
Indexer,
|
|
24
22
|
IndexerConfig,
|
|
25
|
-
SourceIdentifiers,
|
|
26
23
|
Uploader,
|
|
27
24
|
UploaderConfig,
|
|
28
25
|
)
|
|
29
26
|
from unstructured_ingest.v2.logger import logger
|
|
30
27
|
from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
|
|
28
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
29
|
+
FileData,
|
|
30
|
+
FileDataSourceMetadata,
|
|
31
|
+
SourceIdentifiers,
|
|
32
|
+
)
|
|
31
33
|
|
|
32
34
|
if TYPE_CHECKING:
|
|
33
35
|
from fsspec import AbstractFileSystem
|
|
@@ -12,7 +12,6 @@ from pydantic import Field, Secret
|
|
|
12
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
13
|
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
14
14
|
from unstructured_ingest.v2.errors import ProviderError, UserError
|
|
15
|
-
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
16
15
|
from unstructured_ingest.v2.logger import logger
|
|
17
16
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
18
17
|
DestinationRegistryEntry,
|
|
@@ -32,6 +31,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
|
32
31
|
BlobStoreUploadStager,
|
|
33
32
|
BlobStoreUploadStagerConfig,
|
|
34
33
|
)
|
|
34
|
+
from unstructured_ingest.v2.types.file_data import FileDataSourceMetadata
|
|
35
35
|
|
|
36
36
|
if TYPE_CHECKING:
|
|
37
37
|
from gcsfs import GCSFileSystem
|
|
@@ -8,9 +8,6 @@ from pydantic import Field, Secret
|
|
|
8
8
|
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
10
|
from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
|
|
11
|
-
from unstructured_ingest.v2.interfaces import (
|
|
12
|
-
FileDataSourceMetadata,
|
|
13
|
-
)
|
|
14
11
|
from unstructured_ingest.v2.logger import logger
|
|
15
12
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
16
13
|
DestinationRegistryEntry,
|
|
@@ -30,9 +27,15 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
|
30
27
|
BlobStoreUploadStager,
|
|
31
28
|
BlobStoreUploadStagerConfig,
|
|
32
29
|
)
|
|
30
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
31
|
+
FileDataSourceMetadata,
|
|
32
|
+
)
|
|
33
33
|
|
|
34
34
|
CONNECTOR_TYPE = "s3"
|
|
35
35
|
|
|
36
|
+
# https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines-avoid-characters
|
|
37
|
+
CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]
|
|
38
|
+
|
|
36
39
|
if TYPE_CHECKING:
|
|
37
40
|
from s3fs import S3FileSystem
|
|
38
41
|
|
|
@@ -91,7 +94,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
|
|
|
91
94
|
if isinstance(e, PermissionError):
|
|
92
95
|
return UserAuthError(e)
|
|
93
96
|
if isinstance(e, FileNotFoundError):
|
|
94
|
-
return UserError(e)
|
|
97
|
+
return UserError(f"File not found: {e}")
|
|
95
98
|
if cause := getattr(e, "__cause__", None):
|
|
96
99
|
error_response = cause.response
|
|
97
100
|
error_meta = error_response["ResponseMetadata"]
|
|
@@ -140,6 +143,12 @@ class S3Indexer(FsspecIndexer):
|
|
|
140
143
|
}
|
|
141
144
|
if metadata:
|
|
142
145
|
record_locator["metadata"] = metadata
|
|
146
|
+
issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
|
|
147
|
+
if issue_characters:
|
|
148
|
+
logger.warning(
|
|
149
|
+
f"File path {path} contains characters "
|
|
150
|
+
f"that can cause issues with S3: {issue_characters}"
|
|
151
|
+
)
|
|
143
152
|
return FileDataSourceMetadata(
|
|
144
153
|
date_created=date_created,
|
|
145
154
|
date_modified=date_modified,
|
|
@@ -11,7 +11,6 @@ from urllib.parse import urlparse
|
|
|
11
11
|
from pydantic import Field, Secret
|
|
12
12
|
|
|
13
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
-
from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata
|
|
15
14
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
16
15
|
DestinationRegistryEntry,
|
|
17
16
|
SourceRegistryEntry,
|
|
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
|
30
29
|
BlobStoreUploadStager,
|
|
31
30
|
BlobStoreUploadStagerConfig,
|
|
32
31
|
)
|
|
32
|
+
from unstructured_ingest.v2.types.file_data import FileData, FileDataSourceMetadata
|
|
33
33
|
|
|
34
34
|
if TYPE_CHECKING:
|
|
35
35
|
from fsspec.implementations.sftp import SFTPFileSystem
|
|
@@ -16,14 +16,16 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
16
16
|
Downloader,
|
|
17
17
|
DownloaderConfig,
|
|
18
18
|
DownloadResponse,
|
|
19
|
-
FileData,
|
|
20
|
-
FileDataSourceMetadata,
|
|
21
19
|
Indexer,
|
|
22
20
|
IndexerConfig,
|
|
23
|
-
SourceIdentifiers,
|
|
24
21
|
)
|
|
25
22
|
from unstructured_ingest.v2.logger import logger
|
|
26
23
|
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
24
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
25
|
+
FileData,
|
|
26
|
+
FileDataSourceMetadata,
|
|
27
|
+
SourceIdentifiers,
|
|
28
|
+
)
|
|
27
29
|
|
|
28
30
|
CONNECTOR_TYPE = "gitlab"
|
|
29
31
|
if TYPE_CHECKING:
|
|
@@ -21,15 +21,17 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
21
21
|
Downloader,
|
|
22
22
|
DownloaderConfig,
|
|
23
23
|
DownloadResponse,
|
|
24
|
-
FileData,
|
|
25
|
-
FileDataSourceMetadata,
|
|
26
24
|
Indexer,
|
|
27
25
|
IndexerConfig,
|
|
28
|
-
SourceIdentifiers,
|
|
29
26
|
)
|
|
30
27
|
from unstructured_ingest.v2.logger import logger
|
|
31
28
|
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
32
29
|
from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
|
|
30
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
31
|
+
FileData,
|
|
32
|
+
FileDataSourceMetadata,
|
|
33
|
+
SourceIdentifiers,
|
|
34
|
+
)
|
|
33
35
|
|
|
34
36
|
CONNECTOR_TYPE = "google_drive"
|
|
35
37
|
|
|
@@ -15,7 +15,6 @@ from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserErro
|
|
|
15
15
|
from unstructured_ingest.v2.interfaces import (
|
|
16
16
|
AccessConfig,
|
|
17
17
|
ConnectionConfig,
|
|
18
|
-
FileData,
|
|
19
18
|
UploaderConfig,
|
|
20
19
|
)
|
|
21
20
|
from unstructured_ingest.v2.logger import logger
|
|
@@ -27,6 +26,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
|
27
26
|
SQLUploadStager,
|
|
28
27
|
SQLUploadStagerConfig,
|
|
29
28
|
)
|
|
29
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
30
30
|
|
|
31
31
|
if TYPE_CHECKING:
|
|
32
32
|
from pyarrow import Table as ArrowTable
|
|
@@ -15,16 +15,18 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
15
15
|
Downloader,
|
|
16
16
|
DownloaderConfig,
|
|
17
17
|
DownloadResponse,
|
|
18
|
-
FileData,
|
|
19
|
-
FileDataSourceMetadata,
|
|
20
18
|
Indexer,
|
|
21
19
|
IndexerConfig,
|
|
22
|
-
SourceIdentifiers,
|
|
23
20
|
)
|
|
24
21
|
from unstructured_ingest.v2.logger import logger
|
|
25
22
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
26
23
|
SourceRegistryEntry,
|
|
27
24
|
)
|
|
25
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
26
|
+
FileData,
|
|
27
|
+
FileDataSourceMetadata,
|
|
28
|
+
SourceIdentifiers,
|
|
29
|
+
)
|
|
28
30
|
|
|
29
31
|
if TYPE_CHECKING:
|
|
30
32
|
from atlassian import Jira
|
|
@@ -21,15 +21,17 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
21
21
|
Downloader,
|
|
22
22
|
DownloaderConfig,
|
|
23
23
|
DownloadResponse,
|
|
24
|
-
FileData,
|
|
25
|
-
FileDataSourceMetadata,
|
|
26
24
|
Indexer,
|
|
27
25
|
IndexerConfig,
|
|
28
|
-
SourceIdentifiers,
|
|
29
26
|
Uploader,
|
|
30
27
|
UploaderConfig,
|
|
31
28
|
)
|
|
32
29
|
from unstructured_ingest.v2.logger import logger
|
|
30
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
31
|
+
FileData,
|
|
32
|
+
FileDataSourceMetadata,
|
|
33
|
+
SourceIdentifiers,
|
|
34
|
+
)
|
|
33
35
|
|
|
34
36
|
if TYPE_CHECKING:
|
|
35
37
|
from confluent_kafka import Consumer, Producer
|
|
@@ -11,7 +11,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
11
11
|
from unstructured_ingest.v2.interfaces import (
|
|
12
12
|
AccessConfig,
|
|
13
13
|
ConnectionConfig,
|
|
14
|
-
FileData,
|
|
15
14
|
Uploader,
|
|
16
15
|
UploaderConfig,
|
|
17
16
|
UploadStager,
|
|
@@ -21,6 +20,7 @@ from unstructured_ingest.v2.logger import logger
|
|
|
21
20
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
22
21
|
DestinationRegistryEntry,
|
|
23
22
|
)
|
|
23
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
24
24
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
25
25
|
|
|
26
26
|
if TYPE_CHECKING:
|
|
@@ -15,10 +15,14 @@ from unstructured_ingest.logger import logger
|
|
|
15
15
|
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
16
16
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
17
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
18
|
-
from unstructured_ingest.v2.interfaces
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
from unstructured_ingest.v2.interfaces import (
|
|
19
|
+
ConnectionConfig,
|
|
20
|
+
Uploader,
|
|
21
|
+
UploaderConfig,
|
|
22
|
+
UploadStager,
|
|
23
|
+
UploadStagerConfig,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
22
26
|
|
|
23
27
|
CONNECTOR_TYPE = "lancedb"
|
|
24
28
|
|
|
@@ -14,11 +14,8 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
14
14
|
Downloader,
|
|
15
15
|
DownloaderConfig,
|
|
16
16
|
DownloadResponse,
|
|
17
|
-
FileData,
|
|
18
|
-
FileDataSourceMetadata,
|
|
19
17
|
Indexer,
|
|
20
18
|
IndexerConfig,
|
|
21
|
-
SourceIdentifiers,
|
|
22
19
|
Uploader,
|
|
23
20
|
UploaderConfig,
|
|
24
21
|
)
|
|
@@ -31,6 +28,11 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
|
31
28
|
BlobStoreUploadStager,
|
|
32
29
|
BlobStoreUploadStagerConfig,
|
|
33
30
|
)
|
|
31
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
32
|
+
FileData,
|
|
33
|
+
FileDataSourceMetadata,
|
|
34
|
+
SourceIdentifiers,
|
|
35
|
+
)
|
|
34
36
|
|
|
35
37
|
CONNECTOR_TYPE = "local"
|
|
36
38
|
|
|
@@ -13,7 +13,6 @@ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
|
13
13
|
from unstructured_ingest.v2.interfaces import (
|
|
14
14
|
AccessConfig,
|
|
15
15
|
ConnectionConfig,
|
|
16
|
-
FileData,
|
|
17
16
|
Uploader,
|
|
18
17
|
UploaderConfig,
|
|
19
18
|
UploadStager,
|
|
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.logger import logger
|
|
|
23
22
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
24
23
|
DestinationRegistryEntry,
|
|
25
24
|
)
|
|
25
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
26
26
|
|
|
27
27
|
if TYPE_CHECKING:
|
|
28
28
|
from pymilvus import MilvusClient
|
|
@@ -13,17 +13,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
13
13
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
14
14
|
from unstructured_ingest.v2.interfaces import (
|
|
15
15
|
AccessConfig,
|
|
16
|
-
BatchFileData,
|
|
17
|
-
BatchItem,
|
|
18
16
|
ConnectionConfig,
|
|
19
17
|
Downloader,
|
|
20
18
|
DownloaderConfig,
|
|
21
19
|
DownloadResponse,
|
|
22
|
-
FileData,
|
|
23
|
-
FileDataSourceMetadata,
|
|
24
20
|
Indexer,
|
|
25
21
|
IndexerConfig,
|
|
26
|
-
SourceIdentifiers,
|
|
27
22
|
Uploader,
|
|
28
23
|
UploaderConfig,
|
|
29
24
|
download_responses,
|
|
@@ -33,6 +28,13 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
33
28
|
DestinationRegistryEntry,
|
|
34
29
|
SourceRegistryEntry,
|
|
35
30
|
)
|
|
31
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
32
|
+
BatchFileData,
|
|
33
|
+
BatchItem,
|
|
34
|
+
FileData,
|
|
35
|
+
FileDataSourceMetadata,
|
|
36
|
+
SourceIdentifiers,
|
|
37
|
+
)
|
|
36
38
|
|
|
37
39
|
if TYPE_CHECKING:
|
|
38
40
|
from pymongo import MongoClient
|
|
@@ -18,7 +18,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
18
18
|
from unstructured_ingest.v2.interfaces import (
|
|
19
19
|
AccessConfig,
|
|
20
20
|
ConnectionConfig,
|
|
21
|
-
FileData,
|
|
22
21
|
Uploader,
|
|
23
22
|
UploaderConfig,
|
|
24
23
|
UploadStager,
|
|
@@ -28,6 +27,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
28
27
|
DestinationRegistryEntry,
|
|
29
28
|
)
|
|
30
29
|
from unstructured_ingest.v2.processes.connectors.utils import format_and_truncate_orig_elements
|
|
30
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
31
31
|
|
|
32
32
|
SimilarityFunction = Literal["cosine"]
|
|
33
33
|
|
|
@@ -12,14 +12,16 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
12
12
|
Downloader,
|
|
13
13
|
DownloaderConfig,
|
|
14
14
|
DownloadResponse,
|
|
15
|
-
FileData,
|
|
16
|
-
FileDataSourceMetadata,
|
|
17
15
|
Indexer,
|
|
18
16
|
IndexerConfig,
|
|
19
|
-
SourceIdentifiers,
|
|
20
17
|
)
|
|
21
18
|
from unstructured_ingest.v2.logger import logger
|
|
22
19
|
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
20
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
21
|
+
FileData,
|
|
22
|
+
FileDataSourceMetadata,
|
|
23
|
+
SourceIdentifiers,
|
|
24
|
+
)
|
|
23
25
|
|
|
24
26
|
if TYPE_CHECKING:
|
|
25
27
|
from unstructured_ingest.v2.processes.connectors.notion.client import Client
|
|
@@ -22,11 +22,8 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
22
22
|
Downloader,
|
|
23
23
|
DownloaderConfig,
|
|
24
24
|
DownloadResponse,
|
|
25
|
-
FileData,
|
|
26
|
-
FileDataSourceMetadata,
|
|
27
25
|
Indexer,
|
|
28
26
|
IndexerConfig,
|
|
29
|
-
SourceIdentifiers,
|
|
30
27
|
Uploader,
|
|
31
28
|
UploaderConfig,
|
|
32
29
|
)
|
|
@@ -39,6 +36,11 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
|
39
36
|
BlobStoreUploadStager,
|
|
40
37
|
BlobStoreUploadStagerConfig,
|
|
41
38
|
)
|
|
39
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
40
|
+
FileData,
|
|
41
|
+
FileDataSourceMetadata,
|
|
42
|
+
SourceIdentifiers,
|
|
43
|
+
)
|
|
42
44
|
|
|
43
45
|
if TYPE_CHECKING:
|
|
44
46
|
from office365.graph_client import GraphClient
|
|
@@ -16,12 +16,15 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
16
16
|
Downloader,
|
|
17
17
|
DownloaderConfig,
|
|
18
18
|
DownloadResponse,
|
|
19
|
-
FileData,
|
|
20
19
|
Indexer,
|
|
21
20
|
IndexerConfig,
|
|
22
21
|
)
|
|
23
|
-
from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
|
|
24
22
|
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
23
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
24
|
+
FileData,
|
|
25
|
+
FileDataSourceMetadata,
|
|
26
|
+
SourceIdentifiers,
|
|
27
|
+
)
|
|
25
28
|
|
|
26
29
|
MAX_EMAILS_PER_FOLDER = 1_000_000 # Maximum number of emails per folder
|
|
27
30
|
|
|
@@ -13,7 +13,6 @@ from unstructured_ingest.v2.errors import UserError
|
|
|
13
13
|
from unstructured_ingest.v2.interfaces import (
|
|
14
14
|
AccessConfig,
|
|
15
15
|
ConnectionConfig,
|
|
16
|
-
FileData,
|
|
17
16
|
UploaderConfig,
|
|
18
17
|
UploadStager,
|
|
19
18
|
UploadStagerConfig,
|
|
@@ -21,6 +20,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
21
20
|
)
|
|
22
21
|
from unstructured_ingest.v2.logger import logger
|
|
23
22
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
23
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
24
24
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
25
25
|
|
|
26
26
|
if TYPE_CHECKING:
|
|
@@ -13,13 +13,13 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
13
13
|
from unstructured_ingest.v2.interfaces import (
|
|
14
14
|
AccessConfig,
|
|
15
15
|
ConnectionConfig,
|
|
16
|
-
FileData,
|
|
17
16
|
Uploader,
|
|
18
17
|
UploaderConfig,
|
|
19
18
|
UploadStager,
|
|
20
19
|
UploadStagerConfig,
|
|
21
20
|
)
|
|
22
21
|
from unstructured_ingest.v2.logger import logger
|
|
22
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
23
23
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
24
24
|
|
|
25
25
|
if TYPE_CHECKING:
|
|
@@ -11,12 +11,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
11
11
|
from unstructured_ingest.v2.interfaces import (
|
|
12
12
|
AccessConfig,
|
|
13
13
|
ConnectionConfig,
|
|
14
|
-
FileData,
|
|
15
14
|
Uploader,
|
|
16
15
|
UploaderConfig,
|
|
17
16
|
)
|
|
18
17
|
from unstructured_ingest.v2.logger import logger
|
|
19
18
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
19
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
20
20
|
|
|
21
21
|
if TYPE_CHECKING:
|
|
22
22
|
from redis.asyncio import Redis
|
|
@@ -28,16 +28,18 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
28
28
|
Downloader,
|
|
29
29
|
DownloaderConfig,
|
|
30
30
|
DownloadResponse,
|
|
31
|
-
FileData,
|
|
32
|
-
FileDataSourceMetadata,
|
|
33
31
|
Indexer,
|
|
34
32
|
IndexerConfig,
|
|
35
|
-
SourceIdentifiers,
|
|
36
33
|
)
|
|
37
34
|
from unstructured_ingest.v2.logger import logger
|
|
38
35
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
39
36
|
SourceRegistryEntry,
|
|
40
37
|
)
|
|
38
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
39
|
+
FileData,
|
|
40
|
+
FileDataSourceMetadata,
|
|
41
|
+
SourceIdentifiers,
|
|
42
|
+
)
|
|
41
43
|
|
|
42
44
|
|
|
43
45
|
class MissingCategoryError(Exception):
|
|
@@ -11,9 +11,6 @@ from unstructured_ingest.error import (
|
|
|
11
11
|
SourceConnectionNetworkError,
|
|
12
12
|
)
|
|
13
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
-
from unstructured_ingest.v2.interfaces import (
|
|
15
|
-
FileData,
|
|
16
|
-
)
|
|
17
14
|
from unstructured_ingest.v2.logger import logger
|
|
18
15
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
19
16
|
SourceRegistryEntry,
|
|
@@ -26,6 +23,9 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
|
|
|
26
23
|
OnedriveIndexer,
|
|
27
24
|
OnedriveIndexerConfig,
|
|
28
25
|
)
|
|
26
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
27
|
+
FileData,
|
|
28
|
+
)
|
|
29
29
|
|
|
30
30
|
if TYPE_CHECKING:
|
|
31
31
|
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
@@ -20,12 +20,12 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
20
20
|
Indexer,
|
|
21
21
|
IndexerConfig,
|
|
22
22
|
)
|
|
23
|
-
from unstructured_ingest.v2.interfaces.file_data import (
|
|
23
|
+
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
24
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
24
25
|
FileData,
|
|
25
26
|
FileDataSourceMetadata,
|
|
26
27
|
SourceIdentifiers,
|
|
27
28
|
)
|
|
28
|
-
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
29
29
|
|
|
30
30
|
if TYPE_CHECKING:
|
|
31
31
|
from slack_sdk import WebClient
|
|
@@ -7,7 +7,6 @@ from pydantic import Field, Secret
|
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.utils.data_prep import split_dataframe
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
|
-
from unstructured_ingest.v2.interfaces import FileData
|
|
11
10
|
from unstructured_ingest.v2.logger import logger
|
|
12
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
13
12
|
DestinationRegistryEntry,
|
|
@@ -20,6 +19,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
|
20
19
|
SQLUploadStager,
|
|
21
20
|
SQLUploadStagerConfig,
|
|
22
21
|
)
|
|
22
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
23
23
|
|
|
24
24
|
if TYPE_CHECKING:
|
|
25
25
|
from databricks.sdk.core import oauth_service_principal
|
|
@@ -7,7 +7,6 @@ from pydantic import Field, Secret
|
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.utils.data_prep import split_dataframe
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
11
10
|
from unstructured_ingest.v2.logger import logger
|
|
12
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
13
12
|
DestinationRegistryEntry,
|
|
@@ -28,6 +27,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
|
28
27
|
SQLUploadStagerConfig,
|
|
29
28
|
parse_date_string,
|
|
30
29
|
)
|
|
30
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
31
31
|
|
|
32
32
|
if TYPE_CHECKING:
|
|
33
33
|
from pandas import DataFrame
|
|
@@ -16,17 +16,12 @@ from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dat
|
|
|
16
16
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
17
17
|
from unstructured_ingest.v2.interfaces import (
|
|
18
18
|
AccessConfig,
|
|
19
|
-
BatchFileData,
|
|
20
|
-
BatchItem,
|
|
21
19
|
ConnectionConfig,
|
|
22
20
|
Downloader,
|
|
23
21
|
DownloaderConfig,
|
|
24
22
|
DownloadResponse,
|
|
25
|
-
FileData,
|
|
26
|
-
FileDataSourceMetadata,
|
|
27
23
|
Indexer,
|
|
28
24
|
IndexerConfig,
|
|
29
|
-
SourceIdentifiers,
|
|
30
25
|
Uploader,
|
|
31
26
|
UploaderConfig,
|
|
32
27
|
UploadStager,
|
|
@@ -34,6 +29,13 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
34
29
|
download_responses,
|
|
35
30
|
)
|
|
36
31
|
from unstructured_ingest.v2.logger import logger
|
|
32
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
33
|
+
BatchFileData,
|
|
34
|
+
BatchItem,
|
|
35
|
+
FileData,
|
|
36
|
+
FileDataSourceMetadata,
|
|
37
|
+
SourceIdentifiers,
|
|
38
|
+
)
|
|
37
39
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
38
40
|
|
|
39
41
|
if TYPE_CHECKING:
|
|
@@ -251,8 +253,9 @@ class SQLUploadStager(UploadStager):
|
|
|
251
253
|
df[column] = df[column].apply(str)
|
|
252
254
|
return df
|
|
253
255
|
|
|
254
|
-
def write_output(self, output_path: Path, data: list[dict]) -> None:
|
|
256
|
+
def write_output(self, output_path: Path, data: list[dict]) -> Path:
|
|
255
257
|
write_data(path=output_path, data=data)
|
|
258
|
+
return output_path
|
|
256
259
|
|
|
257
260
|
def run(
|
|
258
261
|
self,
|
|
@@ -278,8 +281,10 @@ class SQLUploadStager(UploadStager):
|
|
|
278
281
|
output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
|
|
279
282
|
output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
|
|
280
283
|
|
|
281
|
-
self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
|
|
282
|
-
return output_path
|
|
284
|
+
final_output_path = self.write_output(
|
|
285
|
+
output_path=output_path, data=df.to_dict(orient="records")
|
|
286
|
+
)
|
|
287
|
+
return final_output_path
|
|
283
288
|
|
|
284
289
|
|
|
285
290
|
class SQLUploaderConfig(UploaderConfig):
|
|
@@ -8,9 +8,6 @@ from unstructured_ingest.error import DestinationConnectionError
|
|
|
8
8
|
from unstructured_ingest.utils.data_prep import split_dataframe
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
10
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
11
|
-
from unstructured_ingest.v2.interfaces import (
|
|
12
|
-
FileData,
|
|
13
|
-
)
|
|
14
11
|
from unstructured_ingest.v2.logger import logger
|
|
15
12
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
16
13
|
DestinationRegistryEntry,
|
|
@@ -29,6 +26,9 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
|
29
26
|
SQLUploadStager,
|
|
30
27
|
SQLUploadStagerConfig,
|
|
31
28
|
)
|
|
29
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
30
|
+
FileData,
|
|
31
|
+
)
|
|
32
32
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
33
33
|
|
|
34
34
|
if TYPE_CHECKING:
|
|
@@ -14,7 +14,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
14
14
|
from unstructured_ingest.v2.interfaces import (
|
|
15
15
|
AccessConfig,
|
|
16
16
|
ConnectionConfig,
|
|
17
|
-
FileData,
|
|
18
17
|
Uploader,
|
|
19
18
|
UploaderConfig,
|
|
20
19
|
UploadStager,
|
|
@@ -22,6 +21,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
22
21
|
)
|
|
23
22
|
from unstructured_ingest.v2.logger import logger
|
|
24
23
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
24
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
25
25
|
|
|
26
26
|
BASE_URL = "https://api.vectara.io/v2"
|
|
27
27
|
|