unstructured-ingest 1.2.9__py3-none-any.whl → 1.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +1 -1
- unstructured_ingest/embed/octoai.py +1 -1
- unstructured_ingest/embed/openai.py +1 -1
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +2 -2
- unstructured_ingest/error.py +113 -6
- unstructured_ingest/errors_v2.py +139 -8
- unstructured_ingest/interfaces/downloader.py +2 -2
- unstructured_ingest/processes/connectors/airtable.py +1 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/chroma.py +2 -2
- unstructured_ingest/processes/connectors/confluence.py +6 -2
- unstructured_ingest/processes/connectors/databricks/volumes.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -0
- unstructured_ingest/processes/connectors/delta_table.py +3 -3
- unstructured_ingest/processes/connectors/discord.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +1 -1
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +3 -2
- unstructured_ingest/processes/connectors/fsspec/azure.py +1 -1
- unstructured_ingest/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +3 -2
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +8 -10
- unstructured_ingest/processes/connectors/fsspec/gcs.py +2 -2
- unstructured_ingest/processes/connectors/fsspec/s3.py +1 -1
- unstructured_ingest/processes/connectors/github.py +8 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -1
- unstructured_ingest/processes/connectors/google_drive.py +2 -4
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +12 -10
- unstructured_ingest/processes/connectors/jira.py +1 -1
- unstructured_ingest/processes/connectors/kafka/kafka.py +5 -5
- unstructured_ingest/processes/connectors/local.py +2 -1
- unstructured_ingest/processes/connectors/milvus.py +6 -1
- unstructured_ingest/processes/connectors/mongodb.py +6 -1
- unstructured_ingest/processes/connectors/neo4j.py +6 -2
- unstructured_ingest/processes/connectors/notion/client.py +14 -14
- unstructured_ingest/processes/connectors/notion/connector.py +1 -1
- unstructured_ingest/processes/connectors/onedrive.py +2 -1
- unstructured_ingest/processes/connectors/outlook.py +1 -1
- unstructured_ingest/processes/connectors/pinecone.py +8 -6
- unstructured_ingest/processes/connectors/redisdb.py +2 -2
- unstructured_ingest/processes/connectors/salesforce.py +6 -6
- unstructured_ingest/processes/connectors/sharepoint.py +5 -2
- unstructured_ingest/processes/connectors/slack.py +1 -1
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +1 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +1 -0
- unstructured_ingest/processes/connectors/vectara.py +1 -1
- unstructured_ingest/processes/connectors/weaviate/cloud.py +1 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +1 -1
- unstructured_ingest/processes/connectors/zendesk/client.py +8 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +4 -1
- unstructured_ingest/processes/partitioner.py +1 -1
- unstructured_ingest/unstructured_api.py +1 -1
- {unstructured_ingest-1.2.9.dist-info → unstructured_ingest-1.2.11.dist-info}/METADATA +1 -1
- {unstructured_ingest-1.2.9.dist-info → unstructured_ingest-1.2.11.dist-info}/RECORD +59 -59
- {unstructured_ingest-1.2.9.dist-info → unstructured_ingest-1.2.11.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.2.9.dist-info → unstructured_ingest-1.2.11.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.2.9.dist-info → unstructured_ingest-1.2.11.dist-info}/licenses/LICENSE.md +0 -0
@@ -1 +1 @@
-__version__ = "1.2.9"  # pragma: no cover
+__version__ = "1.2.11"  # pragma: no cover
@@ -8,13 +8,13 @@ from unstructured_ingest.embed.interfaces import (
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
-from unstructured_ingest.
+from unstructured_ingest.error import (
     ProviderError,
     UserAuthError,
     UserError,
     is_internal_error,
 )
-from unstructured_ingest.
+from unstructured_ingest.error import (
     RateLimitError as CustomRateLimitError,
 )
 from unstructured_ingest.logger import logger
@@ -46,8 +46,8 @@ class TogetherAIEmbeddingConfig(EmbeddingConfig):
             return UserAuthError(message)
         if isinstance(e, RateLimitError):
             return CustomRateLimitError(message)
-
-        status_code = getattr(e,
+
+        status_code = getattr(e, "status_code", None)
         if status_code is not None:
             if 400 <= status_code < 500:
                 return UserError(message)
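Note: the embed connector hunks above convert provider exceptions into the package's shared error types, partly by reading an optional status_code attribute off the raised exception. A minimal sketch of that mapping pattern, assuming a hypothetical wrap_error helper and a provider exception that may carry status_code (only the imported error classes come from the package):

# Hypothetical sketch of the status-code mapping shown above; the helper name and
# the exact branch order are assumptions, not the package's actual implementation.
from unstructured_ingest.error import ProviderError, UserAuthError, UserError
from unstructured_ingest.error import RateLimitError as CustomRateLimitError


def wrap_error(e: Exception) -> Exception:  # hypothetical helper name
    message = str(e)
    status_code = getattr(e, "status_code", None)  # provider SDKs often attach this
    if status_code is not None:
        if status_code in (401, 403):
            return UserAuthError(message)
        if status_code == 429:
            return CustomRateLimitError(message)
        if 400 <= status_code < 500:
            return UserError(message)
        return ProviderError(message)
    # Fallback when no status code is available (an assumption, not shown in the diff).
    return ProviderError(message)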
@@ -13,7 +13,7 @@ from unstructured_ingest.embed.interfaces import (
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
-from unstructured_ingest.
+from unstructured_ingest.error import UserAuthError, is_internal_error
 from unstructured_ingest.utils.dep_check import requires_dependencies

 if TYPE_CHECKING:

@@ -8,8 +8,8 @@ from unstructured_ingest.embed.interfaces import (
     BaseEmbeddingEncoder,
     EmbeddingConfig,
 )
-from unstructured_ingest.
-from unstructured_ingest.
+from unstructured_ingest.error import ProviderError, UserAuthError, UserError, is_internal_error
+from unstructured_ingest.error import (
     RateLimitError as CustomRateLimitError,
 )
 from unstructured_ingest.logger import logger
unstructured_ingest/error.py
CHANGED
@@ -1,9 +1,11 @@
 from abc import ABC
 from functools import wraps
+from typing import Optional


-class CustomError(Exception, ABC):
+class UnstructuredIngestError(Exception, ABC):
     error_string: str
+    status_code: Optional[int] = None

     @classmethod
     def wrap(cls, f):
@@ -25,25 +27,130 @@ class CustomError(Exception, ABC):
         return wrapper


-class SourceConnectionError(CustomError):
+class ConnectionError(UnstructuredIngestError):
+    error_string = "Connection error: {}"
+    status_code: Optional[int] = 400
+
+
+class SourceConnectionError(ConnectionError):
     error_string = "Error in getting data from upstream data source: {}"
+    status_code: Optional[int] = 400


 class SourceConnectionNetworkError(SourceConnectionError):
     error_string = "Error in connecting to upstream data source: {}"
+    status_code: Optional[int] = 400


-class DestinationConnectionError(CustomError):
+class DestinationConnectionError(ConnectionError):
     error_string = "Error in connecting to downstream data source: {}"
+    status_code: Optional[int] = 400


-class EmbeddingEncoderConnectionError(CustomError):
+class EmbeddingEncoderConnectionError(ConnectionError):
     error_string = "Error in connecting to the embedding model provider: {}"
+    status_code: Optional[int] = 400
+
+
+class UserError(UnstructuredIngestError):
+    error_string = "User error: {}"
+    status_code: Optional[int] = 401
+
+
+class UserAuthError(UserError):
+    error_string = "User authentication error: {}"
+    status_code: Optional[int] = 401
+
+
+class RateLimitError(UserError):
+    error_string = "Rate limit error: {}"
+    status_code: Optional[int] = 429
+

+class NotFoundError(UnstructuredIngestError):
+    error_string = "Not found error: {}"
+    status_code: Optional[int] = 404

-class WriteError(CustomError):
+
+class TimeoutError(UnstructuredIngestError):
+    error_string = "Timeout error: {}"
+    status_code: Optional[int] = 408
+
+
+class ResponseError(UnstructuredIngestError):
+    error_string = "Response error: {}"
+    status_code: Optional[int] = 400
+
+
+class WriteError(UnstructuredIngestError):
     error_string = "Error in writing to downstream data source: {}"
+    status_code: Optional[int] = 400
+
+
+class ProviderError(UnstructuredIngestError):
+    error_string = "Provider error: {}"
+    status_code: Optional[int] = 500
+
+
+class ValueError(UnstructuredIngestError):
+    error_string = "Value error: {}"


-class PartitionError(CustomError):
+class PartitionError(UnstructuredIngestError):
     error_string = "Error in partitioning content: {}"
+
+
+class QuotaError(UserError):
+    error_string = "Quota error: {}"
+
+
+class MissingCategoryError(UnstructuredIngestError):
+    error_string = "Missing category error: {}"
+
+
+class ValidationError(UnstructuredIngestError):
+    error_string = "Validation error: {}"
+
+
+class KeyError(UnstructuredIngestError):
+    error_string = "Key error: {}"
+
+
+class FileExistsError(UnstructuredIngestError):
+    error_string = "File exists error: {}"
+
+
+class TypeError(UnstructuredIngestError):
+    error_string = "Type error: {}"
+
+
+class IcebergCommitFailedException(UnstructuredIngestError):
+    error_string = "Failed to commit changes to the iceberg table"
+
+
+recognized_errors = [
+    UserError,
+    UserAuthError,
+    RateLimitError,
+    QuotaError,
+    ProviderError,
+    NotFoundError,
+    TypeError,
+    ValueError,
+    FileExistsError,
+    TimeoutError,
+    KeyError,
+    ResponseError,
+    ValidationError,
+    PartitionError,
+    WriteError,
+    ConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+    DestinationConnectionError,
+    EmbeddingEncoderConnectionError,
+]
+
+
+def is_internal_error(e: Exception) -> bool:
+    return any(isinstance(e, recognized_error) for recognized_error in recognized_errors)
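Note: with this rewrite, every error type carries an error_string template and an optional HTTP-style status_code, and the module exposes a wrap decorator plus an is_internal_error helper. A minimal usage sketch based only on what the diff above defines; the fetch_document function is illustrative and not part of the package:

from unstructured_ingest.error import SourceConnectionError, is_internal_error


@SourceConnectionError.wrap  # unexpected exceptions are re-raised as SourceConnectionError
def fetch_document(url: str) -> bytes:  # illustrative function, not part of the package
    raise RuntimeError("connection reset")  # stand-in for a provider failure


try:
    fetch_document("https://example.com/doc")
except Exception as e:
    # prints: SourceConnectionError 400 True
    print(type(e).__name__, getattr(e, "status_code", None), is_internal_error(e))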
unstructured_ingest/errors_v2.py
CHANGED
@@ -1,24 +1,155 @@
-
-
+from abc import ABC
+from functools import wraps
+from typing import Optional
+
+
+class UnstructuredIngestError(Exception, ABC):
+    error_string: str
+    status_code: Optional[int] = None
+
+    @classmethod
+    def wrap(cls, f):
+        """
+        Provides a wrapper for a function that catches any exception and
+        re-raises it as the customer error. If the exception itself is already an instance
+        of the custom error, re-raises original error.
+        """
+
+        @wraps(f)
+        def wrapper(*args, **kwargs):
+            try:
+                return f(*args, **kwargs)
+            except BaseException as error:
+                if not isinstance(error, cls) and not issubclass(type(error), cls):
+                    raise cls(cls.error_string.format(str(error))) from error
+                raise
+
+        return wrapper
+
+
+class ConnectionError(UnstructuredIngestError):
+    error_string = "Connection error: {}"
+    status_code: Optional[int] = 400
+
+
+class SourceConnectionError(ConnectionError):
+    error_string = "Error in getting data from upstream data source: {}"
+    status_code: Optional[int] = 400
+
+
+class SourceConnectionNetworkError(SourceConnectionError):
+    error_string = "Error in connecting to upstream data source: {}"
+    status_code: Optional[int] = 400
+
+
+class DestinationConnectionError(ConnectionError):
+    error_string = "Error in connecting to downstream data source: {}"
+    status_code: Optional[int] = 400
+
+
+class EmbeddingEncoderConnectionError(ConnectionError):
+    error_string = "Error in connecting to the embedding model provider: {}"
+    status_code: Optional[int] = 400
+
+
+class UserError(UnstructuredIngestError):
+    error_string = "User error: {}"
+    status_code: Optional[int] = 401


 class UserAuthError(UserError):
-
+    error_string = "User authentication error: {}"
+    status_code: Optional[int] = 401


 class RateLimitError(UserError):
-
+    error_string = "Rate limit error: {}"
+    status_code: Optional[int] = 429
+
+
+class NotFoundError(UnstructuredIngestError):
+    error_string = "Not found error: {}"
+    status_code: Optional[int] = 404
+
+
+class TimeoutError(UnstructuredIngestError):
+    error_string = "Timeout error: {}"
+    status_code: Optional[int] = 408
+
+
+class ResponseError(UnstructuredIngestError):
+    error_string = "Response error: {}"
+    status_code: Optional[int] = 400
+
+
+class WriteError(UnstructuredIngestError):
+    error_string = "Error in writing to downstream data source: {}"
+    status_code: Optional[int] = 400
+
+
+class ProviderError(UnstructuredIngestError):
+    error_string = "Provider error: {}"
+    status_code: Optional[int] = 500
+
+
+class ValueError(UnstructuredIngestError):
+    error_string = "Value error: {}"
+
+
+class PartitionError(UnstructuredIngestError):
+    error_string = "Error in partitioning content: {}"


 class QuotaError(UserError):
-
+    error_string = "Quota error: {}"
+
+
+class MissingCategoryError(UnstructuredIngestError):
+    error_string = "Missing category error: {}"
+
+
+class ValidationError(UnstructuredIngestError):
+    error_string = "Validation error: {}"
+
+
+class KeyError(UnstructuredIngestError):
+    error_string = "Key error: {}"
+
+
+class FileExistsError(UnstructuredIngestError):
+    error_string = "File exists error: {}"
+
+
+class TypeError(UnstructuredIngestError):
+    error_string = "Type error: {}"


-class
-
+class IcebergCommitFailedException(UnstructuredIngestError):
+    error_string = "Failed to commit changes to the iceberg table"


-recognized_errors = [
+recognized_errors = [
+    UserError,
+    UserAuthError,
+    RateLimitError,
+    QuotaError,
+    ProviderError,
+    NotFoundError,
+    TypeError,
+    ValueError,
+    FileExistsError,
+    TimeoutError,
+    KeyError,
+    ResponseError,
+    ValidationError,
+    PartitionError,
+    WriteError,
+    ConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+    DestinationConnectionError,
+    EmbeddingEncoderConnectionError,
+]


 def is_internal_error(e: Exception) -> bool:
@@ -36,11 +36,11 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     def get_download_path(self, file_data: FileData) -> Optional[Path]:
         if not file_data.source_identifiers:
             return None
-
+
         rel_path = file_data.source_identifiers.relative_path
         if not rel_path:
             return None
-
+
         rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
         return self.download_dir / Path(rel_path)

@@ -6,6 +6,7 @@ from uuid import NAMESPACE_DNS, uuid5
 from pydantic import BaseModel, Field, Secret, field_validator

 from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.error import ValueError
 from unstructured_ingest.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, Generator
 from pydantic import Field, Secret

 from unstructured_ingest.data_types.file_data import FileData
-from unstructured_ingest.error import DestinationConnectionError, WriteError
+from unstructured_ingest.error import DestinationConnectionError, ValueError, WriteError
 from unstructured_ingest.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -7,7 +7,7 @@ from pydantic import Field, Secret
 from pydantic.functional_validators import BeforeValidator

 from unstructured_ingest.data_types.file_data import FileData
-from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.error import DestinationConnectionError, ValueError
 from unstructured_ingest.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -151,7 +151,7 @@ class ChromaUploader(Uploader):
                 metadatas=batch["metadatas"],
             )
         except Exception as e:
-            raise
+            raise DestinationConnectionError(f"chroma error: {e}") from e

     @staticmethod
     def prepare_chroma_list(chunk: tuple[dict[str, Any]]) -> dict[str, list[Any]]:
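Note: the connector hunks in this release replace bare raise statements with typed errors from unstructured_ingest.error. An illustrative sketch of the general pattern, with placeholder names (MyUploader, client.upsert) that are not part of the package:

# Illustrative only: a generic version of the pattern these connector hunks apply.
from unstructured_ingest.error import DestinationConnectionError


class MyUploader:  # placeholder class, not an unstructured-ingest connector
    def __init__(self, client):
        self.client = client

    def run_batch(self, batch: list[dict]) -> None:
        try:
            self.client.upsert(batch)  # placeholder destination call
        except Exception as e:
            # Surface destination failures as a typed, recognized ingest error
            # instead of re-raising the provider's exception directly.
            raise DestinationConnectionError(f"upload failed: {e}") from e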
@@ -11,8 +11,12 @@ from unstructured_ingest.data_types.file_data import (
     FileDataSourceMetadata,
     SourceIdentifiers,
 )
-from unstructured_ingest.error import
-
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    UserAuthError,
+    UserError,
+    ValueError,
+)
 from unstructured_ingest.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
 from pydantic import Field

 from unstructured_ingest.data_types.file_data import FileData
+from unstructured_ingest.error import ValueError
 from unstructured_ingest.interfaces import (
     Uploader,
     UploaderConfig,

@@ -9,7 +9,7 @@ from urllib.parse import urlparse
 from pydantic import Field, Secret

 from unstructured_ingest.data_types.file_data import FileData
-from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.error import DestinationConnectionError, ValueError
 from unstructured_ingest.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -254,12 +254,12 @@ class DeltaTableUploader(Uploader):
         if not queue.empty():
             error_message = queue.get()
             logger.error("Exception occurred in write_deltalake: %s", error_message)
-            raise
+            raise DestinationConnectionError(f"Error in write_deltalake: {error_message}")

         # If the subprocess terminated abnormally but produced no traceback (e.g., SIGABRT),
         # still raise a helpful error for callers.
         if not current_process().daemon and writer.exitcode != 0:
-            raise
+            raise DestinationConnectionError(
                 f"write_deltalake subprocess exited with code {writer.exitcode}"
             )
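Note: the DeltaTableUploader hunk above re-raises failures that a write subprocess reports through a multiprocessing queue as DestinationConnectionError. A self-contained sketch of that propagation pattern; the worker body and function names are placeholders, not the connector's actual implementation:

from multiprocessing import Process, Queue

from unstructured_ingest.error import DestinationConnectionError


def _write_worker(queue: Queue) -> None:
    try:
        raise OSError("disk full")  # stand-in for a failing write_deltalake() call
    except Exception as e:
        queue.put(repr(e))  # ship the failure back to the parent process


def run_write() -> None:
    queue: Queue = Queue()
    writer = Process(target=_write_worker, args=(queue,))
    writer.start()
    writer.join()
    if not queue.empty():
        error_message = queue.get()
        raise DestinationConnectionError(f"Error in write_deltalake: {error_message}")
    if writer.exitcode != 0:
        raise DestinationConnectionError(
            f"write_deltalake subprocess exited with code {writer.exitcode}"
        )


if __name__ == "__main__":
    run_write()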
@@ -9,7 +9,7 @@ from unstructured_ingest.data_types.file_data import (
     FileDataSourceMetadata,
     SourceIdentifiers,
 )
-from unstructured_ingest.error import
+from unstructured_ingest.error import UserAuthError, ValueError
 from unstructured_ingest.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -70,9 +70,9 @@ class DiscordIndexer(Indexer):

     def precheck(self) -> None:
         if not self.connection_config.access_config.get_secret_value().token:
-            raise
+            raise UserAuthError("Discord token is missing")
         if not self.index_config.channels:
-            raise
+            raise ValueError("No channels provided")

     def get_channel_file_data(self, channel_id: str) -> Optional[FileData]:
         # Fetch channel metadata

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.data_types.file_data import FileData
-from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.error import DestinationConnectionError, ValueError
 from unstructured_ingest.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
     DestinationConnectionError,
     SourceConnectionError,
     SourceConnectionNetworkError,
+    UnstructuredIngestError,
 )
 from unstructured_ingest.interfaces import (
     AccessConfig,

@@ -440,10 +441,10 @@ class ElasticsearchUploader(Uploader):
             logger.error(
                 f"Batch upload failed - {e} - with following errors: {sanitized_errors}"
             )
-            raise e
+            raise DestinationConnectionError(str(e))
         except Exception as e:
             logger.error(f"Batch upload failed - {e}")
-            raise e
+            raise UnstructuredIngestError(str(e))

     def _sanitize_bulk_index_error(self, error: dict[str, dict]) -> dict:
         """Remove data uploaded to index from the log, leave only error information.
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
-from unstructured_ingest.
+from unstructured_ingest.error import ProviderError, UserAuthError, UserError, ValueError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,

@@ -10,7 +10,7 @@ from pydantic import Field, Secret
 from pydantic.functional_validators import BeforeValidator

 from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
-from unstructured_ingest.
+from unstructured_ingest.error import ProviderError, UserAuthError, UserError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,

@@ -7,12 +7,13 @@ from typing import TYPE_CHECKING, Any, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
-from unstructured_ingest.
+from unstructured_ingest.error import (
     ProviderError,
     UserAuthError,
     UserError,
+    ValueError,
 )
-from unstructured_ingest.
+from unstructured_ingest.error import (
     RateLimitError as CustomRateLimitError,
 )
 from unstructured_ingest.logger import logger

@@ -17,6 +17,7 @@ from unstructured_ingest.data_types.file_data import (
     FileDataSourceMetadata,
     SourceIdentifiers,
 )
+from unstructured_ingest.error import TypeError, ValueError
 from unstructured_ingest.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -265,29 +266,26 @@ FsspecDownloaderConfigT = TypeVar("FsspecDownloaderConfigT", bound=FsspecDownloa
 @dataclass
 class FsspecDownloader(Downloader):
     TEMP_DIR_PREFIX = "unstructured_"
-
+
     protocol: str
     connection_config: FsspecConnectionConfigT
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[FsspecDownloaderConfigT] = field(
         default_factory=lambda: FsspecDownloaderConfig()
     )
-
+
     def get_download_path(self, file_data: FileData) -> Optional[Path]:
         has_source_identifiers = file_data.source_identifiers is not None
         has_filename = has_source_identifiers and file_data.source_identifiers.filename
-
+
         if not (has_source_identifiers and has_filename):
             return None
-
+
         filename = file_data.source_identifiers.filename
-
+
         mkdir_concurrent_safe(self.download_dir)
-
-        temp_dir = tempfile.mkdtemp(
-            prefix=self.TEMP_DIR_PREFIX,
-            dir=self.download_dir
-        )
+
+        temp_dir = tempfile.mkdtemp(prefix=self.TEMP_DIR_PREFIX, dir=self.download_dir)
         return Path(temp_dir) / filename

     def is_async(self) -> bool:

@@ -10,7 +10,7 @@ from dateutil import parser
 from pydantic import Field, Secret

 from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
-from unstructured_ingest.
+from unstructured_ingest.error import ProviderError, UserError, ValueError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,

@@ -125,7 +125,7 @@ class GcsConnectionConfig(FsspecConnectionConfig):
             raise UserError(message)
         if http_error_code >= 500:
             raise ProviderError(message)
-        logger.error(f"
+        logger.error(f"({type(e)} from gcs): {e}", exc_info=True)
         return e

@@ -10,7 +10,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.data_types.file_data import (
     FileDataSourceMetadata,
 )
-from unstructured_ingest.
+from unstructured_ingest.error import ProviderError, UserAuthError, UserError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import (
     DestinationRegistryEntry,