unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of this package release.
__version__ = "1.2.32" # pragma: no cover
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Ingest CLI
|
|
2
|
+
This package helps map user input via a cli to the underlying ingest code to run a small ETL pipeline.
|
|
3
|
+
|
|
4
|
+
## Design Reference
|
|
5
|
+
[cli.py](cli.py) is the main entrypoint to run the cli itself. The key point here is the interaction between all
|
|
6
|
+
source and destination connectors.
|
|
7
|
+
|
|
8
|
+
To manually run the cli:
|
|
9
|
+
```shell
|
|
10
|
+
PYTHONPATH=. python unstructured_ingest/main.py --help
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
The `main.py` file simply wraps the generated Click command created in `cli.py`.
|
|
14
|
+
|
|
15
|
+
### Source Commands
|
|
16
|
+
All source commands are added as sub commands to the parent ingest Click group. This allows each command to map to
|
|
17
|
+
different connectors with shared and unique parameters.
|
|
18
|
+
|
|
19
|
+
### Destination Commands
|
|
20
|
+
All destination commands are added as sub commands to each parent source command. This allows each invocation of the source
|
|
21
|
+
sub command to display all possible destination subcommands. The code in [utils.py](./utils.py) helps structure the
|
|
22
|
+
generated text from the Click library to be more intuitive on this approach (i.e. list sub commands as `Destinations`).
|
|
23
|
+
|
|
24
|
+
### Configs
|
|
25
|
+
The configs in [configs/](./configs) and connector specific ones in [cmds/](./cmds) help surface all user parameters that
|
|
26
|
+
are needed to marshal the input dictionary from Click into all the respective configs needed to create a full pipeline run.
|
|
27
|
+
Because click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary
|
|
28
|
+
into dataclasses that have nested fields (such as access configs).
|
|
File without changes
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from dataclasses import dataclass, field, fields
|
|
5
|
+
from typing import Any, Optional, Type, TypeVar
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.cli.base.importer import import_from_string
|
|
11
|
+
from unstructured_ingest.cli.utils.click import extract_config
|
|
12
|
+
from unstructured_ingest.cli.utils.model_conversion import options_from_base_model, post_check
|
|
13
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
16
|
+
from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
|
|
17
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
18
|
+
DownloaderT,
|
|
19
|
+
IndexerT,
|
|
20
|
+
RegistryEntry,
|
|
21
|
+
UploaderT,
|
|
22
|
+
UploadStager,
|
|
23
|
+
UploadStagerConfig,
|
|
24
|
+
UploadStagerT,
|
|
25
|
+
destination_registry,
|
|
26
|
+
source_registry,
|
|
27
|
+
)
|
|
28
|
+
from unstructured_ingest.processes.connectors.local import LocalUploader, LocalUploaderConfig
|
|
29
|
+
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
30
|
+
from unstructured_ingest.processes.filter import Filterer, FiltererConfig
|
|
31
|
+
from unstructured_ingest.processes.partitioner import Partitioner, PartitionerConfig
|
|
32
|
+
|
|
33
|
+
# Type variable bound to click.Command so that helpers which take a command and
# return it (e.g. to attach options) preserve the caller's concrete command type.
CommandT = TypeVar("CommandT", bound=click.Command)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class BaseCmd(ABC):
    """Shared base for the dynamically generated click commands.

    Holds the command name and its connector registry entry, turns config
    models into click options, and assembles a ``Pipeline`` from the flat
    option dictionaries that click hands back.
    """

    # Connector/command name; may use "-" or "_" (see the two name properties).
    cmd_name: str
    # Registry entry whose config models and classes drive this command.
    registry_entry: RegistryEntry
    # Extra config models whose fields are exposed as options on the command.
    default_configs: list[Type[BaseModel]] = field(default_factory=list)

    @abstractmethod
    def get_registry_options(self):
        """Return the list of click options derived from ``registry_entry``."""
        pass

    def get_default_options(self) -> list[click.Option]:
        """Return the click options generated from every model in ``default_configs``."""
        options: list[click.Option] = []
        for extra in self.default_configs:
            options.extend(options_from_base_model(model=extra))
        return options

    @classmethod
    def consolidate_options(cls, options: list[click.Option]) -> list[click.Option]:
        """Drop exact duplicate options, keeping the first occurrence of each name.

        Args:
            options: options collected from one or more config models.

        Returns:
            ``options`` itself when no name repeats, otherwise a new list with
            one option per name in first-seen order.

        Raises:
            ValueError: if two options share a name but differ in any attribute.
        """
        option_names = [option.name for option in options]
        if len(set(option_names)) == len(option_names):
            return options
        # Insertion-ordered mapping gives O(1) lookup of the first option seen per name.
        first_by_name: dict[Any, click.Option] = {}
        for option in options:
            if option.name not in first_by_name:
                first_by_name[option.name] = option
                continue
            existing_option = first_by_name[option.name]
            if existing_option.__dict__ == option.__dict__:
                continue
            option_diff = cls.get_options_diff(o1=option, o2=existing_option)
            raise ValueError(
                "Conflicting duplicate {} option defined: {}".format(
                    option.name, " | ".join([f"{d[0]}: {d[1]}" for d in option_diff])
                )
            )
        return list(first_by_name.values())

    @staticmethod
    def get_options_diff(o1: click.Option, o2: click.Option):
        """Return the ``(attribute, value)`` pairs on which two options differ.

        The result is the symmetric difference of both options' attribute
        items. ``opts`` and ``secondary_opts`` are compared as comma-joined
        strings. The options themselves are left untouched.
        """

        def _comparable_items(option) -> set:
            # Build the pairs from a read-only walk so the option's own
            # attributes are never overwritten.
            items = set()
            for key, value in vars(option).items():
                if key in ("opts", "secondary_opts"):
                    value = ",".join(value)
                else:
                    try:
                        hash(value)
                    except TypeError:
                        # Unhashable values (e.g. list defaults) cannot go in a
                        # set; compare them by their repr instead.
                        value = repr(value)
                items.add((key, value))
            return items

        return _comparable_items(o1) ^ _comparable_items(o2)

    @property
    def cmd_name_key(self):
        """``cmd_name`` with dashes replaced by underscores."""
        return self.cmd_name.replace("-", "_")

    @property
    def cli_cmd_name(self):
        """``cmd_name`` with underscores replaced by dashes."""
        return self.cmd_name.replace("_", "-")

    @abstractmethod
    def cmd(self, ctx: click.Context, **options) -> None:
        """Callback executed when the click command runs; implemented by subclasses."""
        pass

    def add_options(self, cmd: CommandT) -> CommandT:
        """Attach the registry and default options to ``cmd`` and return it.

        The combined options are passed through ``post_check`` before being
        appended to ``cmd.params``.
        """
        options = self.get_registry_options()
        options.extend(self.get_default_options())
        post_check(options=options, name=cmd.name)
        cmd.params.extend(options)
        return cmd

    def get_pipeline(
        self,
        src: str,
        source_options: dict[str, Any],
        dest: Optional[str] = None,
        destination_options: Optional[dict[str, Any]] = None,
    ) -> Pipeline:
        """Build a ``Pipeline`` from flat source and destination option dicts.

        Args:
            src: key of the source connector in ``source_registry``.
            source_options: flat options for the source command; also used for
                the processor, partitioner, chunker, filterer and embedder.
            dest: key of the destination connector in ``destination_registry``;
                when falsy a ``LocalUploader`` is used instead.
            destination_options: flat options for the destination command.

        Returns:
            The configured ``Pipeline``.
        """
        # NOTE(review): the option dicts are logged verbatim at debug level; presumably
        # they can contain connector credentials — confirm whether they need redacting.
        logger.debug(
            f"creating pipeline from cli using source {src} with options: {source_options}"
        )
        pipeline_kwargs: dict[str, Any] = {
            "context": self.get_processor_config(options=source_options),
            "downloader": self.get_downloader(src=src, options=source_options),
            "indexer": self.get_indexer(src=src, options=source_options),
            "partitioner": self.get_partitioner(options=source_options),
        }
        if chunker := self.get_chunker(options=source_options):
            pipeline_kwargs["chunker"] = chunker
        if filterer := self.get_filterer(options=source_options):
            pipeline_kwargs["filterer"] = filterer
        if embedder := self.get_embedder(options=source_options):
            pipeline_kwargs["embedder"] = embedder
        if dest:
            # A missing options dict is treated as empty so the stager lookup's
            # ``options.get`` never runs against None.
            destination_options = destination_options or {}
            logger.debug(
                f"setting destination on pipeline {dest} with options: {destination_options}"
            )
            if uploader_stager := self.get_upload_stager(dest=dest, options=destination_options):
                pipeline_kwargs["stager"] = uploader_stager
            pipeline_kwargs["uploader"] = self.get_uploader(dest=dest, options=destination_options)
        else:
            # Default to local uploader
            # TODO remove after v1 no longer supported
            # Work on a copy so the caller's dict is not modified when the
            # output directory is filled in from the source options.
            destination_options = dict(destination_options or {})
            if "output_dir" not in destination_options:
                destination_options["output_dir"] = source_options["output_dir"]
            pipeline_kwargs["uploader"] = self.get_default_uploader(options=destination_options)
        return Pipeline(**pipeline_kwargs)

    @staticmethod
    def get_default_uploader(options: dict[str, Any]) -> UploaderT:
        """Return a ``LocalUploader`` configured from ``options``."""
        uploader_config = extract_config(flat_data=options, config=LocalUploaderConfig)
        return LocalUploader(upload_config=uploader_config)

    @staticmethod
    def get_chunker(options: dict[str, Any]) -> Optional[Chunker]:
        """Return a ``Chunker``, or None when no chunking strategy is configured."""
        chunker_config = extract_config(flat_data=options, config=ChunkerConfig)
        if not chunker_config.chunking_strategy:
            return None
        return Chunker(config=chunker_config)

    @staticmethod
    def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
        """Return a ``Filterer``, or None when the dumped filter config is empty."""
        filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
        if not filterer_configs.model_dump():
            return None
        return Filterer(config=filterer_configs)

    @staticmethod
    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
        """Return an ``Embedder``, or None when no embedding provider is configured."""
        embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
        if not embedder_config.embedding_provider:
            return None
        return Embedder(config=embedder_config)

    @staticmethod
    def get_partitioner(options: dict[str, Any]) -> Partitioner:
        """Return a ``Partitioner`` configured from ``options``."""
        partitioner_config = extract_config(flat_data=options, config=PartitionerConfig)
        return Partitioner(config=partitioner_config)

    @staticmethod
    def get_processor_config(options: dict[str, Any]) -> ProcessorConfig:
        """Return the ``ProcessorConfig`` extracted from ``options``."""
        return extract_config(flat_data=options, config=ProcessorConfig)

    @staticmethod
    def get_indexer(src: str, options: dict[str, Any]) -> IndexerT:
        """Instantiate the indexer registered for ``src``.

        The index and connection configs are only passed when the registry
        entry declares the corresponding config class.
        """
        source_entry = source_registry[src]
        indexer_kwargs: dict[str, Any] = {}
        if indexer_config_cls := source_entry.indexer_config:
            indexer_kwargs["index_config"] = extract_config(
                flat_data=options, config=indexer_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            indexer_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        indexer_cls = source_entry.indexer
        return indexer_cls(**indexer_kwargs)

    @staticmethod
    def get_downloader(src: str, options: dict[str, Any]) -> DownloaderT:
        """Instantiate the downloader registered for ``src``.

        The download and connection configs are only passed when the registry
        entry declares the corresponding config class.
        """
        source_entry = source_registry[src]
        downloader_kwargs: dict[str, Any] = {}
        if downloader_config_cls := source_entry.downloader_config:
            downloader_kwargs["download_config"] = extract_config(
                flat_data=options, config=downloader_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            downloader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        downloader_cls = source_entry.downloader
        return downloader_cls(**downloader_kwargs)

    @staticmethod
    def get_custom_stager(
        stager_reference: str, stager_config_kwargs: Optional[dict] = None
    ) -> Optional[UploadStagerT]:
        """Import and instantiate a user-supplied upload stager.

        Args:
            stager_reference: string resolved via ``import_from_string``.
            stager_config_kwargs: keyword arguments for the stager's config
                class; when empty the stager is built without an explicit config.

        Returns:
            An instance of the referenced stager class.

        Raises:
            ValueError: if the reference is not a class, is not an
                ``UploadStager`` subclass, has no ``upload_stager_config``
                field, or that field's type is not an ``UploadStagerConfig``
                subclass.
        """
        stager_cls = import_from_string(stager_reference)
        if not inspect.isclass(stager_cls):
            raise ValueError(
                f"custom stager must be a reference to a python class, got: {type(stager_cls)}"
            )
        if not issubclass(stager_cls, UploadStager):
            raise ValueError(
                "custom stager must be an implementation of the UploadStager interface"
            )
        fields_dict = {f.name: f.type for f in fields(stager_cls)}
        if "upload_stager_config" not in fields_dict:
            # Report a missing field the same way as the other validation
            # failures instead of leaking a bare KeyError.
            raise ValueError("custom stager must declare an upload_stager_config field")
        # NOTE: a dataclass field's type is a plain string when the defining module
        # uses postponed annotations; such stagers are rejected by the check below.
        upload_stager_config_cls = fields_dict["upload_stager_config"]
        if not inspect.isclass(upload_stager_config_cls):
            raise ValueError(
                f"custom stager config must be a class, got: {type(upload_stager_config_cls)}"
            )
        if not issubclass(upload_stager_config_cls, UploadStagerConfig):
            raise ValueError(
                "custom stager config must be an implementation "
                "of the UploadStagerConfig interface"
            )
        upload_stager_kwargs: dict[str, Any] = {}
        if stager_config_kwargs:
            upload_stager_kwargs["upload_stager_config"] = upload_stager_config_cls(
                **stager_config_kwargs
            )
        return stager_cls(**upload_stager_kwargs)

    @staticmethod
    def get_upload_stager(dest: str, options: dict[str, Any]) -> Optional[UploadStagerT]:
        """Return the upload stager for ``dest``, or None if it has none.

        A ``custom_stager`` entry in ``options`` takes precedence over the
        stager declared in the destination registry.
        """
        if custom_stager := options.get("custom_stager"):
            return BaseCmd.get_custom_stager(
                stager_reference=custom_stager,
                stager_config_kwargs=options.get("custom_stager_config_kwargs"),
            )
        dest_entry = destination_registry[dest]
        upload_stager_kwargs: dict[str, Any] = {}
        if upload_stager_config_cls := dest_entry.upload_stager_config:
            upload_stager_kwargs["upload_stager_config"] = extract_config(
                flat_data=options, config=upload_stager_config_cls
            )
        if upload_stager_cls := dest_entry.upload_stager:
            return upload_stager_cls(**upload_stager_kwargs)
        return None

    @staticmethod
    def get_uploader(dest, options: dict[str, Any]) -> UploaderT:
        """Instantiate the uploader registered for ``dest``.

        The upload and connection configs are only passed when the registry
        entry declares the corresponding config class.
        """
        dest_entry = destination_registry[dest]
        uploader_kwargs: dict[str, Any] = {}
        if uploader_config_cls := dest_entry.uploader_config:
            uploader_kwargs["upload_config"] = extract_config(
                flat_data=options, config=uploader_config_cls
            )
        if connection_config_cls := dest_entry.connection_config:
            uploader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        uploader_cls = dest_entry.uploader
        return uploader_cls(**uploader_kwargs)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.cli.base.cmd import BaseCmd
|
|
7
|
+
from unstructured_ingest.cli.utils.click import Dict, conform_click_options
|
|
8
|
+
from unstructured_ingest.cli.utils.model_conversion import options_from_base_model
|
|
9
|
+
from unstructured_ingest.logger import logger
|
|
10
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class DestCmd(BaseCmd):
    """CLI command for a single destination connector.

    The generated click command is meant to run as a subcommand of a source
    command: it reads the parent context's name and parameters to build and
    run a source-to-destination pipeline.
    """

    # Registry entry supplying the uploader, upload stager and connection config classes.
    registry_entry: DestinationRegistryEntry

    def get_registry_options(self):
        """Build the click options derived from the registry entry's configs.

        Returns:
            Options generated from the uploader, upload stager and connection
            config classes (unset ones are skipped), after being passed through
            ``consolidate_options``.
        """
        candidate_configs = (
            self.registry_entry.uploader_config,
            self.registry_entry.upload_stager_config,
            self.registry_entry.connection_config,
        )
        options = []
        for config in candidate_configs:
            if config:
                options.extend(options_from_base_model(model=config))
        return self.consolidate_options(options=options)

    def cmd(self, ctx: click.Context, **options) -> None:
        # Click callback: runs the pipeline from the parent (source) command to
        # this destination.
        #
        # NOTE: documented with comments rather than a docstring on purpose.
        # get_cmd() passes this callable to click.command(), and click uses the
        # callback's docstring as the command's help text.
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        parent = ctx.parent
        if not parent:
            raise click.ClickException("destination command called without a parent")
        if not parent.info_name:
            raise click.ClickException("parent command missing info name")
        # CLI names use dashes; the pipeline is looked up by the underscore form.
        source_cmd = parent.info_name.replace("-", "_")
        # The parent is guaranteed to exist here, so its params can be read directly.
        source_options: dict = parent.params
        conform_click_options(options)
        try:
            pipeline = self.get_pipeline(
                src=source_cmd,
                source_options=source_options,
                dest=self.cmd_name,
                destination_options=options,
            )
            pipeline.run()
        except Exception as e:
            # Top-level CLI boundary: log the full traceback, then surface a
            # clean click error to the user.
            logger.error(f"failed to run destination command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Command:
        """Create the click command for this destination.

        Returns:
            A ``click.Command`` named ``self.cli_cmd_name`` carrying the options
            added by ``add_options`` plus ``--custom-stager`` and
            ``--custom-stager-config-kwargs``.

        Raises:
            ValueError: If click does not produce a ``Command`` instance.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.command(fn)
        if not isinstance(cmd, click.core.Command):
            raise ValueError(f"generated command was not of expected type Command: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.invoke_without_command = True
        self.add_options(cmd)
        cmd.params.append(
            click.Option(
                ["--custom-stager"],
                required=False,
                type=str,
                default=None,
                help="Pass a pointer to a custom upload stager to use, "
                "must be in format '<module>:<attribute>'",
            )
        )
        cmd.params.append(
            click.Option(
                ["--custom-stager-config-kwargs"],
                required=False,
                type=Dict(),
                default=None,
                help="Any kwargs to instantiate the configuration "
                "associated with the custom stager",
            )
        )
        return cmd
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ImportFromStringError(Exception):
    """Raised when an import string is malformed or cannot be resolved."""


def import_from_string(import_str: Any) -> Any:
    """Resolve a ``"<module>:<attribute>"`` string to the object it names.

    The attribute part may be dotted (``"pkg.mod:Class.method"``); each
    component is looked up with ``getattr`` in turn.

    Args:
        import_str: The import string. Any non-string value is returned
            unchanged.

    Returns:
        The resolved object, or ``import_str`` itself when it is not a string.

    Raises:
        ImportFromStringError: If the string is not in
            ``"<module>:<attribute>"`` form, the named module does not exist,
            or an attribute in the chain is missing. The underlying
            ``ModuleNotFoundError`` / ``AttributeError`` is attached as the
            cause.
        ModuleNotFoundError: If the import fails because a module other than
            the requested one is missing; it is re-raised as-is.
    """
    if not isinstance(import_str, str):
        return import_str

    module_str, _, attrs_str = import_str.partition(":")
    if not module_str or not attrs_str:
        raise ImportFromStringError(
            f'Import string "{import_str}" must be in format "<module>:<attribute>".'
        )

    try:
        module = importlib.import_module(module_str)
    except ModuleNotFoundError as exc:
        if exc.name != module_str:
            # A different module is missing; re-raise untouched so its
            # traceback and exception context stay intact.
            raise
        raise ImportFromStringError(f'Could not import module "{module_str}".') from exc

    instance = module
    try:
        for attr_str in attrs_str.split("."):
            instance = getattr(instance, attr_str)
    except AttributeError as exc:
        raise ImportFromStringError(
            f'Attribute "{attrs_str}" not found in module "{module_str}".'
        ) from exc

    return instance
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.cli.base.cmd import BaseCmd
|
|
9
|
+
from unstructured_ingest.cli.utils.click import Group, conform_click_options
|
|
10
|
+
from unstructured_ingest.cli.utils.model_conversion import options_from_base_model
|
|
11
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
12
|
+
from unstructured_ingest.logger import logger
|
|
13
|
+
from unstructured_ingest.processes import (
|
|
14
|
+
ChunkerConfig,
|
|
15
|
+
EmbedderConfig,
|
|
16
|
+
FiltererConfig,
|
|
17
|
+
PartitionerConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class SrcCmd(BaseCmd):
    """CLI command group for a single source connector.

    The generated click group can run on its own or act as the parent of a
    destination subcommand.
    """

    # Registry entry supplying the connection, indexer and downloader config classes.
    registry_entry: SourceRegistryEntry
    # Config classes (not instances) for the processing steps.
    # NOTE(review): presumably consumed by BaseCmd when building the shared
    # options; confirm against BaseCmd.
    default_configs: list[type[BaseModel]] = field(
        default_factory=lambda: [
            ProcessorConfig,
            PartitionerConfig,
            EmbedderConfig,
            FiltererConfig,
            ChunkerConfig,
        ]
    )

    def get_registry_options(self):
        """Build the click options derived from the registry entry's configs.

        Returns:
            Options generated from the connection, indexer and downloader
            config classes (unset ones are skipped), after being passed through
            ``consolidate_options``.
        """
        candidate_configs = (
            self.registry_entry.connection_config,
            self.registry_entry.indexer_config,
            self.registry_entry.downloader_config,
        )
        options = []
        for config in candidate_configs:
            if config:
                options.extend(options_from_base_model(model=config))
        return self.consolidate_options(options=options)

    def cmd(self, ctx: click.Context, **options: Any) -> None:
        # Click callback: runs a pipeline for this source alone.
        #
        # NOTE: documented with comments rather than a docstring on purpose.
        # get_cmd() passes this callable to click.group(), and click uses the
        # callback's docstring as the command's help text.
        if ctx.invoked_subcommand:
            # The group is created with invoke_without_command=True, so this
            # callback can also fire when a subcommand was given; do nothing
            # here in that case.
            return

        conform_click_options(options)
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        try:
            pipeline = self.get_pipeline(src=self.cmd_name, source_options=options)
            pipeline.run()
        except Exception as e:
            # Top-level CLI boundary: log the full traceback, then surface a
            # clean click error to the user.
            logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Group:
        """Create the click group for this source.

        Returns:
            A ``click.Group`` named ``self.cli_cmd_name`` that is invoked even
            when no subcommand is given, carrying the options added by
            ``add_options``.

        Raises:
            ValueError: If click does not produce a ``Group`` instance.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.group(fn, cls=Group)
        if not isinstance(cmd, click.core.Group):
            raise ValueError(f"generated src command was not of expected type Group: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.invoke_without_command = True
        self.add_options(cmd)

        return cmd
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import click
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.cli.cmds import dest, src
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Root click group of the CLI; get_cmd() attaches the source subcommands to it.
# NOTE: kept without a docstring on purpose, since click would use a docstring
# on this function as the group's help text.
@click.group()
def ingest():
    pass
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_cmd() -> click.Command:
    """Assemble the root CLI command with every source/destination pairing.

    Each destination subcommand is registered under each source subcommand,
    and each source subcommand is then registered under the top-level
    ``ingest`` group, which is returned.
    """
    root = ingest
    for source_group in src:
        # Every destination is reachable beneath every source.
        for destination_cmd in dest:
            source_group.add_command(destination_cmd)
        root.add_command(source_group)
    return root
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import click
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.cli.base import DestCmd, SrcCmd
|
|
4
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
5
|
+
destination_registry,
|
|
6
|
+
source_registry,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
# One command wrapper per connector registered as a source.
src_cmds = [
    SrcCmd(cmd_name=name, registry_entry=entry)
    for name, entry in source_registry.items()
]
# One command wrapper per connector registered as a destination.
dest_cmds = [
    DestCmd(cmd_name=name, registry_entry=entry)
    for name, entry in destination_registry.items()
]

# The click objects built from those wrappers, in registry order.
src: list[click.Group] = [source_cmd.get_cmd() for source_cmd in src_cmds]

dest: list[click.Command] = [destination_cmd.get_cmd() for destination_cmd in dest_cmds]
|
|
File without changes
|