unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field, SecretStr
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.error import UserError
|
|
9
|
+
from unstructured_ingest.interfaces.process import BaseProcess
|
|
10
|
+
from unstructured_ingest.logger import logger
|
|
11
|
+
from unstructured_ingest.unstructured_api import call_api_async
|
|
12
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PartitionerConfig(BaseModel):
    # Settings for the partitioning step: the strategy to use, the filters
    # applied to the produced element dicts, and the connection details used
    # when partitioning through a remote API.
    # (Kept as comments rather than a class docstring so the generated
    # pydantic schema is not altered.)

    strategy: str = Field(
        default="auto",
        description="The method that will be used to process the documents. ",
        examples=["fast", "hi_res", "auto"],
    )
    ocr_languages: Optional[list[str]] = Field(
        default=None,
        description=(
            "A list of language packs to specify which languages to use for OCR, "
            "The appropriate Tesseract language pack needs to be installed."
        ),
        examples=["eng", "deu", "eng,deu"],
    )
    encoding: Optional[str] = Field(
        default=None,
        description=(
            "Text encoding to use when reading documents. "
            "By default the encoding is detected automatically."
        ),
    )
    additional_partition_args: Optional[dict[str, Any]] = Field(
        default=None,
        description="Additional values to pass through to partition()",
    )
    skip_infer_table_types: Optional[list[str]] = Field(
        default=None,
        description="Optional list of document data_types to skip table extraction on",
    )
    fields_include: list[str] = Field(
        default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"],
        description="If set, include the specified top-level fields in an element.",
    )
    flatten_metadata: bool = Field(
        default=False,
        description=(
            "Results in flattened json elements. "
            "Specifically, the metadata key values are brought to "
            "the top-level of the element, and the `metadata` key itself is removed."
        ),
    )
    metadata_exclude: list[str] = Field(
        default_factory=list,
        description="If set, drop the specified metadata fields if they exist.",
    )
    element_exclude: list[str] = Field(
        default_factory=list,
        description="If set, drop the specified element data_types, if they exist.",
    )
    metadata_include: list[str] = Field(
        default_factory=list,
        description=(
            "If set, include the specified metadata "
            "fields if they exist and drop all other fields. "
        ),
    )
    partition_endpoint: Optional[str] = Field(
        default="https://api.unstructuredapp.io/general/v0/general",
        description="If partitioning via api, use the following host.",
    )
    partition_by_api: bool = Field(
        default=False,
        description=(
            "Use a remote API to partition the files."
            " Otherwise, use the function from partition.auto"
        ),
    )
    api_timeout_ms: Optional[int] = Field(
        default=None,
        description="Timeout in milliseconds for all api call during partitioning.",
    )
    api_key: Optional[SecretStr] = Field(
        default=None,
        description="API Key for partition endpoint.",
    )
    hi_res_model_name: Optional[str] = Field(
        default=None,
        description="Model name for hi-res strategy.",
    )
    raise_unsupported_filetype: bool = Field(
        default=False,
        description="Raise an error if the file type is not supported",
    )

    def model_post_init(self, __context: Any) -> None:
        """Reject configs that set both `metadata_exclude` and `metadata_include`.

        Raises:
            ValueError: when both lists are non-empty.
        """
        both_set = bool(self.metadata_exclude) and bool(self.metadata_include)
        if not both_set:
            return
        raise ValueError(
            "metadata_exclude and metadata_include are "
            "mutually exclusive with each other. Cannot specify both."
        )

    def to_partition_kwargs(self) -> dict[str, Any]:
        """Build the keyword arguments handed to the partition call.

        Unset (None) values are left out so the callee's own defaults apply;
        entries from `additional_partition_args` are merged last and therefore
        override the values derived from the dedicated fields.
        """
        # NOTE(review): `encoding` is declared on this config but is not
        # forwarded here -- confirm whether that omission is intentional.
        candidates = (
            ("strategy", self.strategy),
            ("languages", self.ocr_languages),
            ("hi_res_model_name", self.hi_res_model_name),
            ("skip_infer_table_types", self.skip_infer_table_types),
        )
        resolved: dict[str, Any] = {
            name: value for name, value in candidates if value is not None
        }
        extra_args = self.additional_partition_args
        if extra_args:
            resolved.update(extra_args)
        return resolved
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass
class Partitioner(BaseProcess, ABC):
    """Partition a file into a list of element dicts.

    Partitioning runs either in-process (`partition_locally`) or through a
    remote API (`partition_via_api`), selected by `config.partition_by_api`.
    Both paths finish with `postprocess`, which applies the element/metadata
    filters configured on `PartitionerConfig`.
    """

    config: PartitionerConfig

    def is_async(self) -> bool:
        """Return True when partitioning is delegated to the remote API."""
        return self.config.partition_by_api

    @staticmethod
    def _error_text(error: BaseException) -> str:
        """Return the first exception argument as text, or "" if there is none."""
        return str(error.args[0]) if error.args else ""

    @staticmethod
    def _pop_nested(root: dict, dotted_path: str) -> None:
        """Remove the key addressed by `dotted_path` (e.g. "a.b.c") from `root`.

        Every dict on the way to the leaf is replaced by a shallow copy before
        it is modified, so dicts shared with the caller's input stay untouched.
        Nothing happens when any segment of the path is missing or is not a dict.
        """
        *parents, leaf = dotted_path.split(".")
        node = root
        for key in parents:
            child = node.get(key)
            if not isinstance(child, dict):
                # Path does not exist: do not fall back to popping `leaf`
                # from an unrelated level.
                return
            child = dict(child)
            node[key] = child
            node = child
        node.pop(leaf, None)

    def postprocess(self, elements: list[dict]) -> list[dict]:
        """Apply the configured filters to `elements` and return new dicts.

        Steps, in order: drop elements whose "type" is in `element_exclude`;
        apply `metadata_exclude` (dotted names are resolved from the element
        root, plain names inside "metadata") or else `metadata_include`; keep
        only the top-level keys in `fields_include`; optionally flatten
        "metadata" into the element.

        The input list and its dicts are not modified.
        """
        config = self.config
        filters_metadata = bool(config.metadata_exclude or config.metadata_include)
        processed: list[dict] = []

        for source in elements:
            if config.element_exclude and source["type"] in config.element_exclude:
                continue

            elem = source.copy()
            if filters_metadata and isinstance(elem.get("metadata"), dict):
                # Detach metadata from the caller's dict before mutating it.
                elem["metadata"] = dict(elem["metadata"])

            if config.metadata_exclude:
                for excluded in config.metadata_exclude:
                    if "." in excluded:  # handle nested fields
                        self._pop_nested(elem, excluded)
                        continue
                    # Re-read each time: _pop_nested may have swapped the dict.
                    metadata = elem.get("metadata")
                    if isinstance(metadata, dict):
                        metadata.pop(excluded, None)
            elif config.metadata_include:
                metadata = elem.get("metadata")
                if isinstance(metadata, dict):
                    for key in list(metadata):
                        if key not in config.metadata_include:
                            del metadata[key]

            elem = {k: v for k, v in elem.items() if k in config.fields_include}

            if config.flatten_metadata and "metadata" in elem:
                metadata = elem.pop("metadata")
                elem.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))

            # Collect the rebuilt dict; rebinding the loop variable alone would
            # silently discard the field filtering and flattening above.
            processed.append(elem)
        return processed

    @requires_dependencies(dependencies=["unstructured"])
    def partition_locally(
        self, filename: Path, metadata: Optional[dict] = None, **kwargs
    ) -> list[dict]:
        """Partition `filename` in-process and return post-processed element dicts.

        Args:
            filename: path of the file to partition.
            metadata: data-source metadata attached to the elements; treated as
                an empty dict when omitted.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            The post-processed elements, or an empty list when the file type is
            reported as unsupported and `raise_unsupported_filetype` is False.

        Raises:
            ValueError: re-raised from the partition call in all other cases.
        """
        from unstructured.documents.elements import DataSourceMetadata
        from unstructured.partition.auto import partition
        from unstructured.staging.base import elements_to_dicts

        @dataclass
        class FileDataSourceMetadata(DataSourceMetadata):
            filesize_bytes: Optional[int] = None

        metadata = metadata or {}
        partition_kwargs = self.config.to_partition_kwargs()
        logger.debug(f"using local partition with kwargs: {partition_kwargs}")
        logger.debug(f"partitioning file {filename} with metadata {metadata}")
        try:
            elements = partition(
                filename=str(filename.resolve()),
                data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
                **partition_kwargs,
            )
        except ValueError as sdk_error:
            if (
                self.is_unstructured_error_unsupported_filetype(sdk_error=sdk_error)
                and not self.config.raise_unsupported_filetype
            ):
                logger.warning(
                    f"Unsupported file type for strategy {self.config.strategy}: {filename}"
                )
                return []
            raise
        return self.postprocess(elements=elements_to_dicts(elements))

    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
    async def partition_via_api(
        self, filename: Path, metadata: Optional[dict] = None, **kwargs
    ) -> list[dict]:
        """Partition `filename` through the configured API endpoint.

        Args:
            filename: path of the file to send.
            metadata: data-source metadata stored under
                `element["metadata"]["data_source"]` for every returned element;
                treated as an empty dict when omitted.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            The post-processed elements returned by the API.
        """
        metadata = metadata or {}
        logger.debug(f"partitioning file {filename} with metadata: {metadata}")

        # `api_key` is optional on the config; forward None rather than failing
        # with AttributeError on `.get_secret_value()`.
        secret = self.config.api_key
        api_key = secret.get_secret_value() if secret is not None else None

        elements = await call_api_async(
            server_url=self.config.partition_endpoint,
            api_key=api_key,
            filename=filename,
            api_parameters=self.config.to_partition_kwargs(),
            timeout_ms=self.config.api_timeout_ms,
        )

        # Append the data source metadata the auto partition does for you
        for element in elements:
            element.setdefault("metadata", {})["data_source"] = metadata
        return self.postprocess(elements=elements)

    def is_unstructured_error_unsupported_filetype(self, sdk_error: ValueError) -> bool:
        """Return True when `sdk_error`'s message marks an unsupported file type."""
        error_msg = self._error_text(sdk_error)
        return (
            "Invalid file" in error_msg
            or "Unstructured schema" in error_msg
            or "fast strategy is not available for image files" in error_msg
        )

    def is_client_error_unsupported_filetype(self, error: UserError) -> bool:
        """Return True when `error`'s message marks an unsupported file type."""
        error_msg = self._error_text(error)
        if "fast strategy is not available for image files" in error_msg:
            return True
        lowered = error_msg.lower()
        return "file type" in lowered and "is not supported" in lowered

    def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
        """Synchronous entry point: partition `filename` locally."""
        return self.partition_locally(filename, metadata=metadata, **kwargs)

    async def run_async(
        self, filename: Path, metadata: Optional[dict] = None, **kwargs
    ) -> list[dict]:
        """Asynchronous entry point: partition `filename` via the API.

        Returns an empty list when the API reports an unsupported file type and
        `raise_unsupported_filetype` is False; otherwise the `UserError` is
        re-raised.
        """
        try:
            return await self.partition_via_api(filename, metadata=metadata, **kwargs)
        except UserError as user_error:
            if (
                self.is_client_error_unsupported_filetype(error=user_error)
                and not self.config.raise_unsupported_filetype
            ):
                logger.warning(
                    f"Unsupported file type for strategy {self.config.strategy}: {filename}"
                )
                return []
            raise
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from copy import copy
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
from uuid import NAMESPACE_DNS, uuid5
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
11
|
+
from unstructured_ingest.interfaces.process import BaseProcess
|
|
12
|
+
from unstructured_ingest.logger import logger
|
|
13
|
+
from unstructured_ingest.utils.compression import TAR_FILE_EXT, ZIP_FILE_EXT, uncompress_file
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class UncompressConfig(BaseModel):
    # Placeholder configuration: the uncompress step currently has no options.
    ...
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class Uncompressor(BaseProcess, ABC):
    """Expand a downloaded archive into one `FileData` per contained file."""

    config: UncompressConfig = field(default_factory=UncompressConfig)

    def is_async(self) -> bool:
        """Always True for this process."""
        return True

    @staticmethod
    def _swap_trailing(text: str, old: str, new: str) -> str:
        """Replace `old` with `new` at the end of `text`.

        Only the trailing occurrence is swapped, so a directory that happens to
        share the archive's name is left alone. Falls back to a plain
        `str.replace` when `text` does not end with `old`.
        """
        if old and text.endswith(old):
            return text[: -len(old)] + new
        return text.replace(old, new)

    def run(self, file_data: FileData, **kwargs: Any) -> list[FileData]:
        """Uncompress the file behind `file_data` when it is a tar/zip archive.

        Args:
            file_data: description of the downloaded file; its
                `local_download_path` is inspected and, for archives, expanded.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            `[file_data]` unchanged when the file is not a recognised archive,
            otherwise one shallow copy of `file_data` per extracted file with
            identifier, local path and source identifiers pointing at that file.
        """
        local_filepath = Path(file_data.local_download_path)
        # Match on the full name: `Path.suffix` only yields the last suffix, so
        # a multi-part extension (e.g. ".tar.gz") could never match by suffix.
        archive_extensions = tuple(TAR_FILE_EXT + ZIP_FILE_EXT)
        if not local_filepath.name.endswith(archive_extensions):
            return [file_data]

        new_path = uncompress_file(filename=str(local_filepath))
        new_files = [i for i in Path(new_path).rglob("*") if i.is_file()]
        logger.debug(
            "uncompressed {} files from original file {}: {}".format(
                len(new_files), local_filepath, ", ".join([str(f) for f in new_files])
            )
        )

        download_dir = local_filepath.parent
        source = file_data.source_identifiers
        responses = []
        for f in new_files:
            new_file_data = copy(file_data)
            new_file_data.identifier = str(uuid5(NAMESPACE_DNS, str(f)))
            new_file_data.local_download_path = str(f.resolve())
            try:
                # Path-aware: string replacement would strip every occurrence of
                # the parent text (and every "." when the parent is ".").
                new_rel_download_path = str(f.relative_to(download_dir))
            except ValueError:
                # Extracted outside the download dir: keep the legacy derivation.
                new_rel_download_path = str(f).replace(str(download_dir), "")[1:]
            new_file_data.source_identifiers = SourceIdentifiers(
                filename=f.name,
                fullpath=self._swap_trailing(
                    str(source.fullpath), source.filename, new_rel_download_path
                ),
                rel_path=(
                    self._swap_trailing(
                        str(source.rel_path), source.filename, new_rel_download_path
                    )
                    if source.rel_path
                    else None
                ),
            )
            responses.append(new_file_data)
        return responses

    async def run_async(self, file_data: FileData, **kwargs: Any) -> list[FileData]:
        """Async wrapper that delegates directly to `run`."""
        return self.run(file_data=file_data, **kwargs)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
6
|
+
from unstructured_ingest.interfaces import UploadStager, UploadStagerConfig
|
|
7
|
+
from unstructured_ingest.utils.data_prep import get_json_data, write_data
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BlobStoreUploadStagerConfig(UploadStagerConfig):
    # Placeholder configuration: adds no options beyond UploadStagerConfig.
    ...
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class BlobStoreUploadStager(UploadStager):
    """Stage partitioned elements for blob-store upload as a `.json` file."""

    upload_stager_config: BlobStoreUploadStagerConfig = field(
        default_factory=BlobStoreUploadStagerConfig
    )

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Rewrite the elements file as JSON in `output_dir` and return its path.

        Args:
            elements_filepath: file holding the elements to stage.
            file_data: accepted for interface compatibility; not used here.
            output_dir: directory the staged file is written to.
            output_filename: base name used to derive the staged file's path.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            The path of the written file, always carrying a `.json` suffix.
        """
        staged_path = self.get_output_path(
            output_filename=output_filename, output_dir=output_dir
        )
        # Always save as json
        json_path = staged_path.with_suffix(".json")
        elements = get_json_data(elements_filepath)
        write_data(path=json_path, data=elements)
        return json_path
|