unstructured_ingest-1.2.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,408 @@ unstructured_ingest/pipeline/pipeline.py (new file)

```python
from __future__ import annotations

import asyncio
import logging
import multiprocessing as mp
import shutil
from dataclasses import InitVar, dataclass, field
from pathlib import Path
from typing import Any

from unstructured_ingest.interfaces import ProcessorConfig, Uploader
from unstructured_ingest.logger import logger, make_default_logger
from unstructured_ingest.otel import OtelHandler
from unstructured_ingest.pipeline.interfaces import PipelineStep
from unstructured_ingest.pipeline.steps.chunk import Chunker, ChunkStep
from unstructured_ingest.pipeline.steps.download import DownloaderT, DownloadStep
from unstructured_ingest.pipeline.steps.embed import Embedder, EmbedStep
from unstructured_ingest.pipeline.steps.filter import Filterer, FilterStep
from unstructured_ingest.pipeline.steps.index import IndexerT, IndexStep
from unstructured_ingest.pipeline.steps.partition import Partitioner, PartitionStep
from unstructured_ingest.pipeline.steps.stage import UploadStager, UploadStageStep
from unstructured_ingest.pipeline.steps.uncompress import Uncompressor, UncompressStep
from unstructured_ingest.pipeline.steps.upload import UploadStep
from unstructured_ingest.processes.chunker import ChunkerConfig
from unstructured_ingest.processes.connector_registry import (
    ConnectionConfig,
    DownloaderConfigT,
    IndexerConfigT,
    UploaderConfigT,
    UploadStagerConfigT,
    destination_registry,
    source_registry,
)
from unstructured_ingest.processes.connectors.local import LocalUploader
from unstructured_ingest.processes.embedder import EmbedderConfig
from unstructured_ingest.processes.filter import FiltererConfig
from unstructured_ingest.processes.partitioner import PartitionerConfig


class PipelineError(Exception):
    pass


@dataclass
class Pipeline:
    context: ProcessorConfig

    indexer: InitVar[IndexerT]
    indexer_step: IndexStep = field(init=False)

    downloader: InitVar[DownloaderT]
    downloader_step: DownloadStep = field(init=False)

    partitioner: InitVar[Partitioner]
    partitioner_step: PartitionStep = field(init=False)

    chunker: InitVar[Chunker | None] = None
    chunker_step: ChunkStep | None = field(init=False, default=None)

    embedder: InitVar[Embedder | None] = None
    embedder_step: EmbedStep | None = field(init=False, default=None)

    stager: InitVar[UploadStager | None] = None
    stager_step: UploadStageStep | None = field(init=False, default=None)

    uploader: InitVar[Uploader] = field(default=LocalUploader())
    uploader_step: UploadStep | None = field(init=False, default=None)

    uncompress_step: UncompressStep | None = field(init=False, default=None)

    filterer: InitVar[Filterer | None] = None
    filter_step: FilterStep | None = field(init=False, default=None)

    def __post_init__(
        self,
        indexer: IndexerT,
        downloader: DownloaderT,
        partitioner: Partitioner,
        chunker: Chunker | None = None,
        embedder: Embedder | None = None,
        stager: UploadStager | None = None,
        uploader: Uploader | None = None,
        filterer: Filterer | None = None,
    ):
        make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
        otel_handler.init_trace()
        self.indexer_step = IndexStep(process=indexer, context=self.context)
        self.downloader_step = DownloadStep(process=downloader, context=self.context)
        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
        self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
        self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None

        self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None

        self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
        self.uploader_step = UploadStep(process=uploader, context=self.context)
        if self.context.uncompress:
            process = Uncompressor()
            self.uncompress_step = UncompressStep(process=process, context=self.context)

        self.check_destination_connector()

    def check_destination_connector(self):
        # Make sure that if the set destination connector expects a stager, one is also set
        if not self.uploader_step:
            return
        uploader_connector_type = self.uploader_step.process.connector_type
        registry_entry = destination_registry[uploader_connector_type]
        if registry_entry.upload_stager and self.stager_step is None:
            try:
                self.stager_step = UploadStageStep(
                    process=registry_entry.upload_stager(), context=self.context
                )
                return
            except Exception as e:
                logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
            raise ValueError(
                f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
                f"expects a stager of type {registry_entry.upload_stager.__name__} "
                f"but one was not set"
            )

    def cleanup(self):
        if self.context.delete_cache and Path(self.context.work_dir).exists():
            logger.info(f"deleting cache directory: {self.context.work_dir}")
            shutil.rmtree(self.context.work_dir)

    def log_statuses(self):
        if status := self.context.status:
            logger.error(f"{len(status)} failed documents:")
            for k, v in status.items():
                for kk, vv in v.items():
                    logger.error(f"{k}: [{kk}] {vv}")

    def _run_initialization(self):
        failures = {}
        init_kwargs = {}
        for step in self._get_ordered_steps():
            try:
                step.process.init(**init_kwargs)
                step.process.precheck()
                # Make sure embedder dimensions available for downstream steps
                if isinstance(step.process, Embedder):
                    embed_dimensions = step.process.config.get_embedder().dimension
                    init_kwargs["vector_length"] = embed_dimensions

            except Exception as e:
                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
        if failures:
            for k, v in failures.items():
                logger.error(f"Step initialization failure: {k}: {v}")
            raise PipelineError("Initialization failed")

    def run(self):
        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
        try:
            with otel_handler.get_tracer().start_as_current_span(
                "ingest process", record_exception=True
            ):
                self._run_initialization()
                self._run()
        finally:
            self.log_statuses()
            self.cleanup()
            if self.context.status:
                raise PipelineError("Pipeline did not run successfully")

    def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
        if not results:
            return None
        results = [r for r in results if r]
        flat = []
        for r in results:
            if isinstance(r, list):
                flat.extend(r)
            else:
                flat.append(r)
        final = [f for f in flat if f]
        return final or None

    def _get_ordered_steps(self) -> list[PipelineStep]:
        steps = [self.indexer_step, self.downloader_step]
        if self.uncompress_step:
            steps.append(self.uncompress_step)
        steps.append(self.partitioner_step)
        if self.chunker_step:
            steps.append(self.chunker_step)
        if self.embedder_step:
            steps.append(self.embedder_step)
        if self.stager_step:
            steps.append(self.stager_step)
        steps.append(self.uploader_step)
        return steps

    def apply_filter(self, records: list[dict]) -> list[dict]:
        if not self.filter_step:
            return records
        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
        filtered_data = self.filter_step(data_to_filter)
        filtered_data = [f for f in filtered_data if f is not None]
        filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
        filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
        return filtered_records

    def get_indices(self) -> list[dict]:
        if self.indexer_step.process.is_async():

            async def run_async():
                output = []
                async for i in self.indexer_step.run_async():
                    output.append(i)
                return output

            indices = asyncio.run(run_async())
        else:
            indices = self.indexer_step.run()
        indices_inputs = [{"file_data_path": i} for i in indices]
        return indices_inputs

    def _run(self):
        logger.info(
            f"running local pipeline: {self} with configs: {self.context.model_dump_json()}"
        )
        if self.context.mp_supported:
            manager = mp.Manager()
            self.context.status = manager.dict()
        else:
            self.context.status = {}

        # Index into data source
        indices_inputs = self.get_indices()
        if not indices_inputs:
            logger.info("No files to process after indexer, exiting")
            return

        # Initial filtering on indexed content
        indices_inputs = self.apply_filter(records=indices_inputs)
        if not indices_inputs:
            logger.info("No files to process after filtering indexed content, exiting")
            return

        # Download associated content to local file system
        downloaded_data = self.downloader_step(indices_inputs)
        downloaded_data = self.clean_results(results=downloaded_data)
        if not downloaded_data:
            logger.info("No files to process after downloader, exiting")
            return

        # Post download filtering
        downloaded_data = self.apply_filter(records=downloaded_data)
        if not downloaded_data:
            logger.info("No files to process after filtering downloaded content, exiting")
            return

        # Run uncompress if available
        if self.uncompress_step:
            downloaded_data = self.uncompress_step(downloaded_data)
            # Flatten list of lists
            downloaded_data = self.clean_results(results=downloaded_data)

        # Post uncompress filtering
        downloaded_data = self.apply_filter(records=downloaded_data)
        if not downloaded_data:
            logger.info("No files to process after filtering uncompressed content, exiting")
            return

        if not downloaded_data or self.context.download_only:
            return

        # Partition content
        elements = self.partitioner_step(downloaded_data)
        elements = self.clean_results(results=elements)
        # Downloaded data no longer needed, delete if possible
        self.downloader_step.delete_cache()
        elements = self.clean_results(results=elements)
        if not elements:
            logger.info("No files to process after partitioning, exiting")
            return

        # Run element specific modifiers
        last_step = self.partitioner_step
        for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
            elements = step(elements)
            elements = self.clean_results(results=elements)
            # Delete data from previous step if possible since no longer needed
            last_step.delete_cache()
            last_step = step
            if not elements:
                logger.info(f"no files to process after {step.__class__.__name__}, exiting")
                return

        # Upload the final result
        self.uploader_step(iterable=elements)
        last_step.delete_cache()

    def __str__(self):
        s = [str(self.indexer_step)]
        if filter_step := self.filter_step:
            s.append(str(filter_step))
        s.append(str(self.downloader_step))
        if filter_step := self.filter_step:
            s.append(str(filter_step))
        if uncompress_step := self.uncompress_step:
            s.extend([str(uncompress_step), str(filter_step)])
        s.append(str(self.partitioner_step))
        if chunker_step := self.chunker_step:
            s.append(str(chunker_step))
        if embedder_step := self.embedder_step:
            s.append(str(embedder_step))
        if stager_step := self.stager_step:
            s.append(str(stager_step))
        s.append(str(self.uploader_step))
        return " -> ".join(s)

    @classmethod
    def from_configs(
        cls,
        context: ProcessorConfig,
        indexer_config: IndexerConfigT,
        downloader_config: DownloaderConfigT,
        source_connection_config: ConnectionConfig,
        partitioner_config: PartitionerConfig,
        filterer_config: FiltererConfig | None = None,
        chunker_config: ChunkerConfig | None = None,
        embedder_config: EmbedderConfig | None = None,
        destination_connection_config: ConnectionConfig | None = None,
        stager_config: UploadStagerConfigT | None = None,
        uploader_config: UploaderConfigT | None = None,
    ) -> "Pipeline":
        # Get registry key based on indexer config
        source_entry = {
            k: v
            for k, v in source_registry.items()
            if type(indexer_config) is v.indexer_config
            and type(downloader_config) is v.downloader_config
            and type(source_connection_config) is v.connection_config
        }
        if len(source_entry) > 1:
            raise ValueError(
                f"multiple entries found matching provided indexer, "
                f"downloader and connection configs: {source_entry}"
            )
        if len(source_entry) != 1:
            raise ValueError(
                "no entry found in source registry with matching indexer, "
                "downloader and connection configs"
            )
        source = list(source_entry.values())[0]
        pipeline_kwargs = {
            "context": context,
            "indexer": source.indexer(
                index_config=indexer_config, connection_config=source_connection_config
            ),
            "downloader": source.downloader(
                download_config=downloader_config, connection_config=source_connection_config
            ),
            "partitioner": Partitioner(config=partitioner_config),
        }
        if filterer_config:
            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
        if chunker_config:
            pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
        if embedder_config:
            pipeline_kwargs["embedder"] = Embedder(config=embedder_config)
        if not uploader_config:
            return Pipeline(**pipeline_kwargs)

        destination_entry = {
            k: v
            for k, v in destination_registry.items()
            if isinstance(uploader_config, v.uploader_config)
        }
        if destination_connection_config:
            destination_entry = {
                k: v
                for k, v in destination_entry.items()
                if isinstance(destination_connection_config, v.connection_config)
            }
        if stager_config:
            destination_entry = {
                k: v
                for k, v in destination_entry.items()
                if isinstance(stager_config, v.upload_stager_config)
            }

        if len(destination_entry) > 1:
            raise ValueError(
                f"multiple entries found matching provided uploader, "
                f"stager and connection configs: {destination_entry}"
            )
        if len(destination_entry) != 1:
            raise ValueError(
                "no entry found in destination registry with matching uploader, "
                "stager and connection configs"
            )

        destination = list(destination_entry.values())[0]
        if stager_config:
            pipeline_kwargs["stager"] = destination.upload_stager(
                upload_stager_config=stager_config
            )
        if uploader_config:
            uploader_kwargs = {"upload_config": uploader_config}
            if destination_connection_config:
                uploader_kwargs["connection_config"] = destination_connection_config
            pipeline_kwargs["uploader"] = destination.uploader(**uploader_kwargs)
        return cls(**pipeline_kwargs)
```
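As a usage sketch (not part of the diff): `Pipeline.from_configs` resolves the source registry entry from the config types it is handed, builds each step, and falls back to `LocalUploader` when no `uploader_config` is given, so a minimal local run needs only the source-side configs. The names `LocalIndexerConfig`, `LocalDownloaderConfig`, and `LocalConnectionConfig` below are assumed from the local connector module and are not shown in this diff.

```python
from unstructured_ingest.interfaces import ProcessorConfig
from unstructured_ingest.pipeline.pipeline import Pipeline
from unstructured_ingest.processes.connectors.local import (
    LocalConnectionConfig,  # assumed name, not shown in this diff
    LocalDownloaderConfig,  # assumed name, not shown in this diff
    LocalIndexerConfig,     # assumed name, not shown in this diff
)
from unstructured_ingest.processes.partitioner import PartitionerConfig

# With no uploader_config, from_configs() returns a pipeline that ends in
# the default LocalUploader; run() raises PipelineError if any document
# fails (context.status is non-empty at the end of the run).
pipeline = Pipeline.from_configs(
    context=ProcessorConfig(),
    indexer_config=LocalIndexerConfig(input_path="./docs"),
    downloader_config=LocalDownloaderConfig(),
    source_connection_config=LocalConnectionConfig(),
    partitioner_config=PartitionerConfig(),
)
pipeline.run()
```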
File without changes: unstructured_ingest/pipeline/steps/__init__.py

@@ -0,0 +1,78 @@ unstructured_ingest/pipeline/steps/chunk.py (new file)

```python
import asyncio
import hashlib
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, TypedDict

from unstructured_ingest.data_types.file_data import FileData, file_data_from_file
from unstructured_ingest.logger import logger
from unstructured_ingest.pipeline.interfaces import PipelineStep
from unstructured_ingest.processes.chunker import Chunker
from unstructured_ingest.utils.data_prep import write_data
from unstructured_ingest.utils.pydantic_models import serialize_base_model_json

STEP_ID = "chunk"


class ChunkStepResponse(TypedDict):
    file_data_path: str
    path: str


@dataclass
class ChunkStep(PipelineStep):
    process: Chunker
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.config.chunking_strategy})"

    def __post_init__(self):
        config = self.process.config.model_dump_json() if self.process.config else None
        logger.info(f"created {self.identifier} with configs: {config}")

    def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
        if self.context.reprocess or file_data.reprocess:
            return True
        return not filepath.exists()

    def get_output_filepath(self, filename: Path) -> Path:
        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
        filepath = (self.cache_dir / hashed_output_file).resolve()
        filepath.parent.mkdir(parents=True, exist_ok=True)
        return filepath

    def _save_output(self, output_filepath: str, chunked_content: list[dict]):
        logger.debug(f"writing chunker output to: {output_filepath}")
        write_data(path=Path(output_filepath), data=chunked_content)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str, **kwargs
    ) -> ChunkStepResponse:
        path = Path(path)
        file_data = file_data_from_file(path=file_data_path)
        output_filepath = self.get_output_filepath(filename=path)
        if not self.should_chunk(filepath=output_filepath, file_data=file_data):
            logger.debug(f"skipping chunking, output already exists: {output_filepath}")
            return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
        fn_kwargs = {"elements_filepath": path}
        if not asyncio.iscoroutinefunction(fn):
            chunked_content_raw = fn(**fn_kwargs)
        elif semaphore := self.context.semaphore:
            async with semaphore:
                chunked_content_raw = await fn(**fn_kwargs)
        else:
            chunked_content_raw = await fn(**fn_kwargs)
        self._save_output(
            output_filepath=str(output_filepath),
            chunked_content=chunked_content_raw,
        )
        return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))

    def get_hash(self, extras: Optional[list[str]]) -> str:
        hashable_string = serialize_base_model_json(
            model=self.process.config, sort_keys=True, ensure_ascii=True
        )
        if extras:
            hashable_string += "".join(extras)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
```
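The `get_hash` method makes chunk output content-addressed: the cache filename is the first 12 hex characters of a SHA-256 over the serialized chunker config plus the input filename, so rerunning with an unchanged config reuses the existing `.ndjson` file, while any config change produces a fresh cache entry. A self-contained sketch of the same derivation (not part of the diff), with plain `json.dumps` standing in for `serialize_base_model_json`:

```python
import hashlib
import json

def chunk_cache_name(chunker_config: dict, source_filename: str) -> str:
    # Mirrors ChunkStep.get_hash: hash the serialized config plus the
    # source filename, keep the first 12 hex chars, append .ndjson.
    hashable = json.dumps(chunker_config, sort_keys=True, ensure_ascii=True)
    hashable += source_filename
    digest = hashlib.sha256(hashable.encode()).hexdigest()[:12]
    return f"{digest}.ndjson"

# Same config + same input -> same cache path on every run.
print(chunk_cache_name({"chunking_strategy": "by_title"}, "report.pdf.json"))
```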
@@ -0,0 +1,206 @@ unstructured_ingest/pipeline/steps/download.py (new file)

```python
import asyncio
import hashlib
import json
import shutil
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, TypedDict, TypeVar

from unstructured_ingest.data_types.file_data import FileData, file_data_from_file
from unstructured_ingest.interfaces import Downloader, download_responses
from unstructured_ingest.logger import logger
from unstructured_ingest.pipeline.interfaces import PipelineStep
from unstructured_ingest.utils.pydantic_models import serialize_base_model_json

DownloaderT = TypeVar("DownloaderT", bound=Downloader)

STEP_ID = "download"


class DownloadStepResponse(TypedDict):
    file_data_path: str
    path: str


@dataclass
class DownloadStep(PipelineStep):
    process: DownloaderT
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        config = (
            self.process.download_config.model_dump_json() if self.process.download_config else None
        )
        connection_config = (
            self.process.connection_config.model_dump_json()
            if self.process.connection_config
            else None
        )
        logger.info(
            f"Created {self.identifier} with configs: {config}, "
            f"connection configs: {connection_config}"
        )

    @staticmethod
    def is_float(value: str):
        try:
            float(value)
            return True
        except ValueError:
            return False

    def should_download(self, file_data: FileData, file_data_path: str) -> bool:
        if self.context.re_download:
            return True
        download_path = self.process.get_download_path(file_data=file_data)
        if not download_path or not download_path.exists():
            return True
        if (
            download_path.is_file()
            and file_data.metadata.date_modified
            and self.is_float(file_data.metadata.date_modified)
            and download_path.stat().st_mtime > float(file_data.metadata.date_modified)
        ):
            # Also update file data to mark this to reprocess since this won't change the filename
            file_data.reprocess = True
            file_data.to_file(path=file_data_path)
            return True
        return False

    def update_file_data(
        self, file_data: FileData, file_data_path: Path, download_path: Path
    ) -> None:
        file_data.local_download_path = str(download_path.resolve())
        file_size_bytes = download_path.stat().st_size
        if not file_data.metadata.filesize_bytes and file_size_bytes:
            file_data.metadata.filesize_bytes = file_size_bytes
        if (
            file_data.metadata.filesize_bytes
            and file_data.metadata.filesize_bytes != file_size_bytes
        ):
            logger.warning(
                f"file size in original file data "
                f"({file_data.metadata.filesize_bytes}) doesn't "
                f"match size of local file: {file_size_bytes}, updating"
            )
            file_data.metadata.filesize_bytes = file_size_bytes
        logger.debug(f"updating file data with new content: {file_data.model_dump_json()}")
        with file_data_path.open("w") as file:
            file.write(file_data.model_dump_json(indent=2))

    async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
        file_data = file_data_from_file(path=file_data_path)
        download_path = self.process.get_download_path(file_data=file_data)
        if not self.should_download(file_data=file_data, file_data_path=file_data_path):
            logger.debug(f"skipping download, file already exists locally: {download_path}")
            self.update_file_data(
                file_data=file_data,
                file_data_path=Path(file_data_path),
                download_path=download_path,
            )
            return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
        fn_kwargs = {"file_data": file_data}
        if not asyncio.iscoroutinefunction(fn):
            download_results = fn(**fn_kwargs)
        elif semaphore := self.context.semaphore:
            async with semaphore:
                download_results = await fn(**fn_kwargs)
        else:
            download_results = await fn(**fn_kwargs)
        return self.create_step_results(
            current_file_data_path=file_data_path,
            download_results=download_results,
            current_file_data=file_data,
        )

    def create_step_results(
        self,
        current_file_data_path: str,
        current_file_data: FileData,
        download_results: download_responses,
    ) -> list[DownloadStepResponse]:
        responses = []
        if not isinstance(download_results, list):
            file_data = current_file_data
            file_data_path = current_file_data_path
            download_path = download_results["path"]
            if download_results["file_data"].identifier == current_file_data.identifier:
                self.update_file_data(
                    file_data=file_data,
                    file_data_path=Path(file_data_path),
                    download_path=download_path,
                )
                responses = [
                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
                ]
            else:
                file_data = download_results["file_data"]
                file_data_path = self.persist_new_file_data(file_data=file_data)
                self.update_file_data(
                    file_data=file_data,
                    file_data_path=Path(file_data_path),
                    download_path=download_path,
                )
                responses = [
                    DownloadStepResponse(
                        file_data_path=current_file_data_path, path=str(download_results["path"])
                    )
                ]
        else:
            # Supplemental results generated as part of the download process
            for res in download_results:
                file_data = res["file_data"]
                file_data_path = self.persist_new_file_data(file_data=file_data)
                download_path = res["path"]
                self.update_file_data(
                    file_data=file_data,
                    file_data_path=Path(file_data_path),
                    download_path=download_path,
                )
                responses.append(
                    DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
                )

        return responses

    def persist_new_file_data(self, file_data: FileData) -> str:
        record_hash = self.get_hash(extras=[file_data.identifier])
        filename = f"{record_hash}.json"
        filepath = (self.cache_dir / filename).resolve()
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(str(filepath), "w") as f:
            f.write(file_data.model_dump_json(indent=2))
        return str(filepath)

    def get_hash(self, extras: Optional[list[str]]) -> str:
        download_config_dict = json.loads(
            serialize_base_model_json(model=self.process.download_config)
        )
        connection_config_dict = json.loads(
            serialize_base_model_json(model=self.process.connection_config)
        )
        hashable_dict = {
            "download_config": download_config_dict,
            "connection_config": connection_config_dict,
        }
        hashable_string = json.dumps(hashable_dict, sort_keys=True)
        if extras:
            hashable_string += "".join(extras)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]

    @property
    def cache_dir(self) -> Path:
        return self.process.download_config.download_dir

    def delete_cache(self):
        if (
            self.context.iter_delete
            and not self.context.preserve_downloads
            and self.cache_dir.exists()
        ):
            cache_dir = self.cache_dir
            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
            shutil.rmtree(cache_dir)
```
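`DownloadStep._run_async` dispatches its callable with the same pattern as `ChunkStep._run_async` above: synchronous functions run inline, while coroutines are awaited under the context's semaphore when one is set, capping how many downloads (or chunk calls) run concurrently. A minimal standalone sketch of that pattern (not part of the diff), assuming a plain `asyncio.Semaphore` as the limiter:

```python
import asyncio
from typing import Any, Callable

async def dispatch(
    fn: Callable[..., Any], semaphore: asyncio.Semaphore | None, **kwargs
) -> Any:
    # Sync callables run inline; coroutine functions are awaited,
    # optionally bounded by a semaphore to limit concurrency.
    if not asyncio.iscoroutinefunction(fn):
        return fn(**kwargs)
    if semaphore:
        async with semaphore:
            return await fn(**kwargs)
    return await fn(**kwargs)

async def main():
    sem = asyncio.Semaphore(2)  # at most two coroutines in flight

    async def fetch(n: int) -> int:
        await asyncio.sleep(0.01)  # stand-in for a real download
        return n * 2

    results = await asyncio.gather(*(dispatch(fetch, sem, n=i) for i in range(5)))
    print(results)  # [0, 2, 4, 6, 8]

asyncio.run(main())
```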