unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from functools import wraps
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class UnstructuredIngestError(Exception, ABC):
    """Common base class for the error types defined in this module.

    Subclasses provide ``error_string``, a ``str.format`` template with one
    positional placeholder that ``wrap`` fills with the text of the original
    exception, and may override ``status_code``.
    """

    # Message template consumed by ``wrap``; declared here, assigned by subclasses.
    error_string: str
    # Numeric code attached to the error type; the base class leaves it unset.
    status_code: Optional[int] = None

    @classmethod
    def wrap(cls, f):
        """Decorate ``f`` so that its failures surface as ``cls``.

        Args:
            f: The callable to wrap.

        Returns:
            A wrapper with the same call signature as ``f``. The wrapper returns
            whatever ``f`` returns.

        Raises:
            cls: When ``f`` raises an ``Exception`` that is not already an
                instance of ``cls``. The new error's message is
                ``cls.error_string`` formatted with ``str(original)`` and the
                original exception is chained as ``__cause__``. Exceptions that
                are already instances of ``cls`` are re-raised untouched.
        """

        @wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except cls:
                # Already the requested error type (or a subclass): keep it as is.
                raise
            except Exception as error:
                # Only ``Exception`` is converted. ``KeyboardInterrupt``,
                # ``SystemExit``, ``GeneratorExit`` and task cancellation derive
                # from ``BaseException`` and must propagate unchanged so that
                # interpreter shutdown and cancellation keep working.
                raise cls(cls.error_string.format(str(error))) from error

        return wrapper
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ConnectionError(UnstructuredIngestError):
    """Error reported with the ``Connection error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin
    ``ConnectionError``.
    """

    error_string = "Connection error: {}"
    status_code: Optional[int] = 400
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SourceConnectionError(ConnectionError):
    """Connection error raised while getting data from the upstream source."""

    error_string = "Error in getting data from upstream data source: {}"
    status_code: Optional[int] = 400
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SourceConnectionNetworkError(SourceConnectionError):
    """Source connection error raised while connecting to the upstream source."""

    error_string = "Error in connecting to upstream data source: {}"
    status_code: Optional[int] = 400
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class DestinationConnectionError(ConnectionError):
    """Connection error raised while connecting to the downstream destination."""

    error_string = "Error in connecting to downstream data source: {}"
    status_code: Optional[int] = 400
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class EmbeddingEncoderConnectionError(ConnectionError):
    """Connection error raised while connecting to the embedding model provider."""

    error_string = "Error in connecting to the embedding model provider: {}"
    status_code: Optional[int] = 400
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class UserError(UnstructuredIngestError):
    """Error reported with the ``User error`` message prefix.

    Base class of ``UserAuthError``, ``RateLimitError`` and ``QuotaError``.
    """

    error_string = "User error: {}"
    status_code: Optional[int] = 401
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class UserAuthError(UserError):
    """User error reported as an authentication failure."""

    error_string = "User authentication error: {}"
    status_code: Optional[int] = 401
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class RateLimitError(UserError):
    """User error reported when a rate limit is hit."""

    error_string = "Rate limit error: {}"
    status_code: Optional[int] = 429
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class NotFoundError(UnstructuredIngestError):
    """Error reported with the ``Not found error`` message prefix."""

    error_string = "Not found error: {}"
    status_code: Optional[int] = 404
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class TimeoutError(UnstructuredIngestError):
    """Error reported with the ``Timeout error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin ``TimeoutError``.
    """

    error_string = "Timeout error: {}"
    status_code: Optional[int] = 408
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class ResponseError(UnstructuredIngestError):
    """Error reported with the ``Response error`` message prefix."""

    error_string = "Response error: {}"
    status_code: Optional[int] = 400
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class WriteError(UnstructuredIngestError):
    """Error raised while writing to the downstream data source."""

    error_string = "Error in writing to downstream data source: {}"
    status_code: Optional[int] = 400
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class ProviderError(UnstructuredIngestError):
    """Error reported with the ``Provider error`` message prefix."""

    error_string = "Provider error: {}"
    status_code: Optional[int] = 500
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class ValueError(UnstructuredIngestError):
    """Error reported with the ``Value error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin ``ValueError``.
    """

    error_string = "Value error: {}"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class PartitionError(UnstructuredIngestError):
    """Error raised while partitioning content."""

    error_string = "Error in partitioning content: {}"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class QuotaError(UserError):
    """User error reported with the ``Quota error`` message prefix.

    Does not override ``status_code``, so the value from ``UserError`` applies.
    """

    error_string = "Quota error: {}"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class MissingCategoryError(UnstructuredIngestError):
    """Error reported with the ``Missing category error`` message prefix."""

    error_string = "Missing category error: {}"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class ValidationError(UnstructuredIngestError):
    """Error reported with the ``Validation error`` message prefix."""

    error_string = "Validation error: {}"
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class KeyError(UnstructuredIngestError):
    """Error reported with the ``Key error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin ``KeyError``.
    """

    error_string = "Key error: {}"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class FileExistsError(UnstructuredIngestError):
    """Error reported with the ``File exists error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin
    ``FileExistsError``.
    """

    error_string = "File exists error: {}"
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class TypeError(UnstructuredIngestError):
    """Error reported with the ``Type error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin ``TypeError``.
    """

    error_string = "Type error: {}"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class IcebergCommitFailedException(UnstructuredIngestError):
    """Error raised when committing changes to an iceberg table fails."""

    # NOTE(review): this template has no ``{}`` placeholder, so
    # ``UnstructuredIngestError.wrap`` produces this fixed text and the original
    # error's message is only reachable through ``__cause__`` -- confirm intended.
    error_string = "Failed to commit changes to the iceberg table"
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Error types that ``is_internal_error`` treats as recognized.
# NOTE(review): ``MissingCategoryError`` and ``IcebergCommitFailedException`` are
# not listed -- confirm whether that is intentional.
recognized_errors = [
    # user-attributed errors
    UserError,
    UserAuthError,
    RateLimitError,
    QuotaError,
    # provider / lookup errors
    ProviderError,
    NotFoundError,
    # general errors
    TypeError,
    ValueError,
    FileExistsError,
    TimeoutError,
    KeyError,
    ResponseError,
    ValidationError,
    # processing and write errors
    PartitionError,
    WriteError,
    # connection errors
    ConnectionError,
    SourceConnectionError,
    SourceConnectionNetworkError,
    DestinationConnectionError,
    EmbeddingEncoderConnectionError,
]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def is_internal_error(e: Exception) -> bool:
    """Return True when ``e`` is an instance of any type in ``recognized_errors``."""
    # ``isinstance`` accepts a tuple of classes and returns False for an empty one,
    # which matches ``any(...)`` over the list.
    return isinstance(e, tuple(recognized_errors))
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from functools import wraps
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class UnstructuredIngestError(Exception, ABC):
    """Common base class for the error types defined in this module.

    Subclasses provide ``error_string``, a ``str.format`` template with one
    positional placeholder that ``wrap`` fills with the text of the original
    exception, and may override ``status_code``.
    """

    # Message template consumed by ``wrap``; declared here, assigned by subclasses.
    error_string: str
    # Numeric code attached to the error type; the base class leaves it unset.
    status_code: Optional[int] = None

    @classmethod
    def wrap(cls, f):
        """Decorate ``f`` so that its failures surface as ``cls``.

        Args:
            f: The callable to wrap.

        Returns:
            A wrapper with the same call signature as ``f``. The wrapper returns
            whatever ``f`` returns.

        Raises:
            cls: When ``f`` raises an ``Exception`` that is not already an
                instance of ``cls``. The new error's message is
                ``cls.error_string`` formatted with ``str(original)`` and the
                original exception is chained as ``__cause__``. Exceptions that
                are already instances of ``cls`` are re-raised untouched.
        """

        @wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except cls:
                # Already the requested error type (or a subclass): keep it as is.
                raise
            except Exception as error:
                # Only ``Exception`` is converted. ``KeyboardInterrupt``,
                # ``SystemExit``, ``GeneratorExit`` and task cancellation derive
                # from ``BaseException`` and must propagate unchanged so that
                # interpreter shutdown and cancellation keep working.
                raise cls(cls.error_string.format(str(error))) from error

        return wrapper
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ConnectionError(UnstructuredIngestError):
    """Error reported with the ``Connection error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin
    ``ConnectionError``.
    """

    error_string = "Connection error: {}"
    status_code: Optional[int] = 400
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SourceConnectionError(ConnectionError):
    """Connection error raised while getting data from the upstream source."""

    error_string = "Error in getting data from upstream data source: {}"
    status_code: Optional[int] = 400
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SourceConnectionNetworkError(SourceConnectionError):
    """Source connection error raised while connecting to the upstream source."""

    error_string = "Error in connecting to upstream data source: {}"
    status_code: Optional[int] = 400
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class DestinationConnectionError(ConnectionError):
    """Connection error raised while connecting to the downstream destination."""

    error_string = "Error in connecting to downstream data source: {}"
    status_code: Optional[int] = 400
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class EmbeddingEncoderConnectionError(ConnectionError):
    """Connection error raised while connecting to the embedding model provider."""

    error_string = "Error in connecting to the embedding model provider: {}"
    status_code: Optional[int] = 400
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class UserError(UnstructuredIngestError):
    """Error reported with the ``User error`` message prefix.

    Base class of ``UserAuthError``, ``RateLimitError`` and ``QuotaError``.
    """

    error_string = "User error: {}"
    status_code: Optional[int] = 401
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class UserAuthError(UserError):
    """User error reported as an authentication failure."""

    error_string = "User authentication error: {}"
    status_code: Optional[int] = 401
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class RateLimitError(UserError):
    """User error reported when a rate limit is hit."""

    error_string = "Rate limit error: {}"
    status_code: Optional[int] = 429
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class NotFoundError(UnstructuredIngestError):
    """Error reported with the ``Not found error`` message prefix."""

    error_string = "Not found error: {}"
    status_code: Optional[int] = 404
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class TimeoutError(UnstructuredIngestError):
    """Error reported with the ``Timeout error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin ``TimeoutError``.
    """

    error_string = "Timeout error: {}"
    status_code: Optional[int] = 408
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class ResponseError(UnstructuredIngestError):
    """Error reported with the ``Response error`` message prefix."""

    error_string = "Response error: {}"
    status_code: Optional[int] = 400
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class WriteError(UnstructuredIngestError):
    """Error raised while writing to the downstream data source."""

    error_string = "Error in writing to downstream data source: {}"
    status_code: Optional[int] = 400
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class ProviderError(UnstructuredIngestError):
    """Error reported with the ``Provider error`` message prefix."""

    error_string = "Provider error: {}"
    status_code: Optional[int] = 500
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class ValueError(UnstructuredIngestError):
    """Error reported with the ``Value error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin ``ValueError``.
    """

    error_string = "Value error: {}"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class PartitionError(UnstructuredIngestError):
    """Error raised while partitioning content."""

    error_string = "Error in partitioning content: {}"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class QuotaError(UserError):
    """User error reported with the ``Quota error`` message prefix.

    Does not override ``status_code``, so the value from ``UserError`` applies.
    """

    error_string = "Quota error: {}"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class MissingCategoryError(UnstructuredIngestError):
    """Error reported with the ``Missing category error`` message prefix."""

    error_string = "Missing category error: {}"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class ValidationError(UnstructuredIngestError):
    """Error reported with the ``Validation error`` message prefix."""

    error_string = "Validation error: {}"
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class KeyError(UnstructuredIngestError):
    """Error reported with the ``Key error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin ``KeyError``.
    """

    error_string = "Key error: {}"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class FileExistsError(UnstructuredIngestError):
    """Error reported with the ``File exists error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin
    ``FileExistsError``.
    """

    error_string = "File exists error: {}"
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class TypeError(UnstructuredIngestError):
    """Error reported with the ``Type error`` message prefix.

    NOTE: inside this module the name shadows Python's builtin ``TypeError``.
    """

    error_string = "Type error: {}"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class IcebergCommitFailedException(UnstructuredIngestError):
    """Error raised when committing changes to an iceberg table fails."""

    # NOTE(review): this template has no ``{}`` placeholder, so
    # ``UnstructuredIngestError.wrap`` produces this fixed text and the original
    # error's message is only reachable through ``__cause__`` -- confirm intended.
    error_string = "Failed to commit changes to the iceberg table"
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Error types that ``is_internal_error`` treats as recognized.
# NOTE(review): ``MissingCategoryError`` and ``IcebergCommitFailedException`` are
# not listed -- confirm whether that is intentional.
recognized_errors = [
    # user-attributed errors
    UserError,
    UserAuthError,
    RateLimitError,
    QuotaError,
    # provider / lookup errors
    ProviderError,
    NotFoundError,
    # general errors
    TypeError,
    ValueError,
    FileExistsError,
    TimeoutError,
    KeyError,
    ResponseError,
    ValidationError,
    # processing and write errors
    PartitionError,
    WriteError,
    # connection errors
    ConnectionError,
    SourceConnectionError,
    SourceConnectionNetworkError,
    DestinationConnectionError,
    EmbeddingEncoderConnectionError,
]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def is_internal_error(e: Exception) -> bool:
    """Return True when ``e`` is an instance of any type in ``recognized_errors``."""
    # ``isinstance`` accepts a tuple of classes and returns False for an empty one,
    # which matches ``any(...)`` over the list.
    return isinstance(e, tuple(recognized_errors))
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from .connector import AccessConfig, BaseConnector, ConnectionConfig
|
|
2
|
+
from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
|
|
3
|
+
from .indexer import Indexer, IndexerConfig
|
|
4
|
+
from .process import BaseProcess
|
|
5
|
+
from .processor import ProcessorConfig
|
|
6
|
+
from .upload_stager import UploadStager, UploadStagerConfig
|
|
7
|
+
from .uploader import UploadContent, Uploader, UploaderConfig, VectorDBUploader
|
|
8
|
+
|
|
9
|
+
# Public names re-exported from the submodules imported above.
__all__ = [
    # .downloader
    "DownloadResponse",
    "download_responses",
    "Downloader",
    "DownloaderConfig",
    # .indexer
    "Indexer",
    "IndexerConfig",
    # .process
    "BaseProcess",
    # .processor
    "ProcessorConfig",
    # .upload_stager
    "UploadStager",
    "UploadStagerConfig",
    # .uploader
    "Uploader",
    "UploaderConfig",
    "UploadContent",
    # .connector
    "AccessConfig",
    "ConnectionConfig",
    "BaseConnector",
    # .uploader
    "VectorDBUploader",
]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any, TypeVar, Union
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Secret, model_validator
|
|
6
|
+
from pydantic.types import _SecretBase
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.processes.utils.logging.connector import ConnectorLoggingMixin
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AccessConfig(BaseModel):
    """Base model for the sensitive part of a connector's configuration.

    Meant to hold any sensitive information associated with other configs, as
    well as access-specific settings. Declares no fields of its own.
    """
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Type variable bounded by AccessConfig (accepts AccessConfig or any subclass).
AccessConfigT = TypeVar("AccessConfigT", bound=AccessConfig)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ConnectionConfig(BaseModel):
    """Base model for connector connection settings.

    Carries the connector's access configuration wrapped in a pydantic
    ``Secret``.
    """

    # The access config, wrapped in ``Secret``.
    access_config: Secret[AccessConfigT]

    def get_access_config(self) -> dict[str, Any]:
        """Return the unwrapped access config as a plain dict.

        Returns:
            ``{}`` when ``access_config`` is falsy, otherwise the
            ``model_dump()`` of the secret's inner value.
        """
        if not self.access_config:
            return {}
        return self.access_config.get_secret_value().model_dump()

    @model_validator(mode="after")
    def check_access_config(self):
        """Validate that ``access_config`` is a pydantic secret wrapper.

        ``None`` is accepted only when the field annotation is optional.

        Raises:
            ValueError: If ``access_config`` is not a ``_SecretBase`` instance.
        """
        access_config = self.access_config
        if self._is_access_config_optional() and access_config is None:
            return self
        if not isinstance(access_config, _SecretBase):
            raise ValueError("access_config must be an instance of SecretBase")
        return self

    def _is_access_config_optional(self) -> bool:
        """Return True when the ``access_config`` annotation is ``X | None``.

        Both spellings are recognized: ``typing.Optional[X]`` /
        ``typing.Union[X, None]`` and the PEP 604 form ``X | None``.
        """
        # Function-scope imports keep this block self-contained.
        import types
        from typing import get_args, get_origin

        # Read the field map from the class: instance access to ``model_fields``
        # is deprecated in recent pydantic releases.
        annotation = type(self).model_fields["access_config"].annotation
        origin = get_origin(annotation)
        # ``types.UnionType`` (the type of ``X | None``) exists on Python 3.10+.
        pep604_union = getattr(types, "UnionType", None)
        is_union = origin is Union or (pep604_union is not None and origin is pep604_union)
        if not is_union:
            return False
        members = get_args(annotation)
        return len(members) == 2 and type(None) in members
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Type variable bounded by ConnectionConfig (accepts ConnectionConfig or any subclass).
ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class BaseConnector(ABC, ConnectorLoggingMixin):
    """Abstract dataclass base for connectors.

    Holds the connection configuration and mixes in ``ConnectorLoggingMixin``.
    """

    connection_config: ConnectionConfigT

    def __post_init__(self):
        """Run the logging mixin's initializer once the dataclass fields are set."""
        # The dataclass-generated ``__init__`` does not call the mixin's
        # ``__init__``, so it is invoked explicitly here.
        ConnectorLoggingMixin.__init__(self)
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Optional, TypedDict, TypeVar, Union
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.interfaces.connector import BaseConnector
|
|
10
|
+
from unstructured_ingest.interfaces.process import BaseProcess
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DownloaderConfig(BaseModel):
    """Settings model for downloaders."""

    # Target directory for downloads; ``None`` means "use the default".
    # NOTE(review): ``Downloader.download_dir`` in this file builds the default as
    # ``~/.cache/unstructured/ingest/download/<connector_type>``, which differs from
    # the path quoted in the description below -- confirm which one is intended.
    download_dir: Optional[Path] = Field(
        default=None,
        # Trailing space added: the two literals are concatenated, and without it
        # the text rendered as "...a location at`$HOME/...".
        description="Where files are downloaded to, defaults to a location at "
        "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Type variable bounded by DownloaderConfig (accepts DownloaderConfig or any subclass).
DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DownloadResponse(TypedDict):
    """Typed dict pairing a file record with a local path."""

    # The file's ``FileData`` record.
    file_data: FileData
    # The path associated with that record.
    path: Path
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Type alias: either a single DownloadResponse or a list of them.
download_responses = Union[list[DownloadResponse], DownloadResponse]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Downloader(BaseProcess, BaseConnector, ABC):
    """Abstract base for processes that download a file described by ``FileData``.

    Concrete downloaders are expected to provide ``run`` (and optionally
    ``run_async``); this base supplies path resolution and response building.
    """

    # Identifier of the connector; also used as the last component of the
    # default download directory.
    connector_type: str
    # Downloader settings, including the optional explicit download directory.
    download_config: DownloaderConfigT

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return the local path a file should be downloaded to.

        Args:
            file_data: Record whose ``source_identifiers.relative_path`` gives
                the file's location relative to the source root.

        Returns:
            ``download_dir`` joined with the relative path, or ``None`` when
            the record has no source identifiers or no relative path.
        """
        if not file_data.source_identifiers:
            return None

        rel_path = file_data.source_identifiers.relative_path
        if not rel_path:
            return None

        # Strip *all* leading slashes. Joining a path with an absolute segment
        # discards everything to its left, so a value such as "//a/b" would
        # otherwise resolve outside of the download directory.
        return self.download_dir / Path(rel_path.lstrip("/"))

    @staticmethod
    def is_float(value: str) -> bool:
        """Return True when ``value`` can be converted with ``float()``."""
        try:
            float(value)
        except (TypeError, ValueError):
            # TypeError covers non-string, non-numeric values (e.g. None);
            # ValueError covers strings that are not numeric.
            return False
        return True

    def generate_download_response(
        self, file_data: FileData, download_path: Path
    ) -> DownloadResponse:
        """Build the response for a completed download.

        When both ``date_created`` and ``date_modified`` in the metadata are
        float-parsable, they are applied to the downloaded file via
        ``os.utime`` (created as access time, modified as modification time).
        The resolved download path is recorded on ``file_data``.

        Args:
            file_data: Record for the downloaded file; its
                ``local_download_path`` is updated in place.
            download_path: Location of the downloaded file on disk.

        Returns:
            A ``DownloadResponse`` pairing ``file_data`` with ``download_path``.
        """
        metadata = file_data.metadata
        if (
            metadata.date_modified
            and self.is_float(metadata.date_modified)
            and metadata.date_created
            and self.is_float(metadata.date_created)
        ):
            date_modified = float(metadata.date_modified)
            date_created = float(metadata.date_created)
            os.utime(download_path, times=(date_created, date_modified))
        file_data.local_download_path = str(download_path.resolve())
        return DownloadResponse(file_data=file_data, path=download_path)

    @property
    def download_dir(self) -> Path:
        """Directory files are downloaded into.

        If the config does not specify one, a default of
        ``~/.cache/unstructured/ingest/download/<connector_type>`` is computed
        and stored back on the config so later accesses reuse it.
        """
        if self.download_config.download_dir is None:
            self.download_config.download_dir = (
                Path.home()
                / ".cache"
                / "unstructured"
                / "ingest"
                / "download"
                / self.connector_type
            ).resolve()
        return self.download_config.download_dir

    def is_async(self) -> bool:
        """Report this process as async."""
        return True

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download the file described by ``file_data``; must be overridden."""
        raise NotImplementedError()

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Async entry point; by default delegates to the synchronous ``run``."""
        return self.run(file_data=file_data, **kwargs)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
7
|
+
from unstructured_ingest.interfaces.connector import BaseConnector
|
|
8
|
+
from unstructured_ingest.interfaces.process import BaseProcess
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class IndexerConfig(BaseModel):
    """Base configuration model for indexers; declares no fields of its own."""
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Type variable constrained to IndexerConfig (or a subclass of it); used to
# annotate the `index_config` field on indexers.
IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Indexer(BaseProcess, BaseConnector, ABC):
    """Abstract base for processes that yield ``FileData`` records.

    Both ``run`` and ``run_async`` raise ``NotImplementedError`` here; a
    concrete indexer supplies whichever it supports.
    """

    # Identifier of the connector this indexer belongs to.
    connector_type: str
    # Optional indexer settings; defaults to None.
    index_config: Optional[IndexerConfigT] = None

    def is_async(self) -> bool:
        """Report this process as synchronous."""
        return False

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Synchronous indexing entry point; not implemented in the base."""
        raise NotImplementedError

    async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
        """Asynchronous indexing entry point; not implemented in the base."""
        raise NotImplementedError
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class BaseProcess(ABC):
    """Abstract dataclass base for a pipeline process.

    Subclasses must implement ``run``; the remaining hooks have no-op or
    delegating defaults.
    """

    def is_async(self) -> bool:
        """Return whether the process is async; False by default."""
        return False

    def init(self, **kwargs: Any) -> None:
        """Optional setup hook; does nothing by default."""
        return None

    def precheck(self) -> None:
        """Optional validation hook; does nothing by default."""
        return None

    @abstractmethod
    def run(self, **kwargs: Any) -> Any:
        """Execute the process; concrete subclasses must override this."""

    async def run_async(self, **kwargs: Any) -> Any:
        """Async entry point; by default returns the result of ``run``."""
        outcome = self.run(**kwargs)
        return outcome
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from asyncio import Semaphore
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
7
|
+
|
|
8
|
+
# Default pipeline working directory: ~/.cache/unstructured/ingest/pipeline,
# resolved to an absolute path string once, at import time.
DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ProcessorConfig(BaseModel):
    """Pipeline-wide processing options.

    Besides the user-facing flags, the model carries a mutable ``status`` dict
    and an optional ``asyncio.Semaphore`` that is created after validation
    when ``max_connections`` is set.
    """

    # Needed so the non-pydantic `Semaphore` type can be used as a field.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    reprocess: bool = Field(
        default=False,
        description="Reprocess a downloaded file even if the relevant structured "
        "output .json file in output directory already exists.",
    )
    verbose: bool = Field(default=False)
    tqdm: bool = Field(default=False, description="Display tqdm progress bar")
    work_dir: str = Field(
        default_factory=lambda: DEFAULT_WORK_DIR,
        description="Where to place working files when processing each step",
    )
    num_processes: int = Field(
        default=2, description="Number of parallel processes with which to process docs"
    )
    max_connections: Optional[int] = Field(
        default=None, description="Limit of concurrent connections"
    )
    raise_on_error: bool = Field(
        default=False,
        description="If set, will raise error if any doc in the pipeline fails. "
        "Otherwise will log error and continue with other docs",
    )
    # Default is read from the INGEST_DISABLE_PARALLELISM environment variable
    # each time a config is instantiated; only "true" (any case) enables it.
    disable_parallelism: bool = Field(
        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
    )
    preserve_downloads: bool = Field(
        default=False, description="Don't delete downloaded files after process completes"
    )
    download_only: bool = Field(
        default=False, description="skip the rest of the process after files are downloaded"
    )
    re_download: bool = Field(
        default=False,
        description="If set, will re-download downloaded files "
        "regardless of if they already exist locally",
    )
    uncompress: bool = Field(
        default=False,
        description="Uncompress any archived files. Currently supporting "
        "zip and tar files based on file extension.",
    )
    iter_delete: bool = Field(
        default=False,
        description="If limited on memory, this can be enabled to delete "
        "cached content as it's used and no longer needed in the pipeline.",
    )
    delete_cache: bool = Field(
        default=False,
        description="If set, will delete the cache work directory when process finishes",
    )

    # OTEL support
    otel_endpoint: Optional[str] = Field(
        default=None, description="OTEL endpoint to publish trace data to"
    )

    # Used to keep track of state in pipeline
    status: dict = Field(default_factory=dict)
    # Populated in `model_post_init`; excluded from serialization.
    semaphore: Optional[Semaphore] = Field(init=False, default=None, exclude=True)

    def model_post_init(self, __context: Any) -> None:
        """Create the connection-limiting semaphore when a limit is configured."""
        if self.max_connections is not None:
            self.semaphore = Semaphore(self.max_connections)

    @property
    def mp_supported(self) -> bool:
        """True when parallelism is enabled and more than one process is requested."""
        return not self.disable_parallelism and self.num_processes > 1

    @property
    def async_supported(self) -> bool:
        """True when async execution may be used.

        Async is off whenever parallelism is disabled. With an integer
        connection limit it requires that limit to exceed one; otherwise it
        is on.
        """
        if self.disable_parallelism:
            return False
        # isinstance() already rules out None, so no separate None check is needed.
        if isinstance(self.max_connections, int):
            return self.max_connections > 1
        return True
|