unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import os
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from time import time
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, Secret
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.data_types.file_data import (
|
|
11
|
+
FileDataSourceMetadata,
|
|
12
|
+
)
|
|
13
|
+
from unstructured_ingest.error import ProviderError, UserAuthError, UserError
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
16
|
+
DestinationRegistryEntry,
|
|
17
|
+
SourceRegistryEntry,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
20
|
+
FsspecAccessConfig,
|
|
21
|
+
FsspecConnectionConfig,
|
|
22
|
+
FsspecDownloader,
|
|
23
|
+
FsspecDownloaderConfig,
|
|
24
|
+
FsspecIndexer,
|
|
25
|
+
FsspecIndexerConfig,
|
|
26
|
+
FsspecUploader,
|
|
27
|
+
FsspecUploaderConfig,
|
|
28
|
+
)
|
|
29
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
30
|
+
BlobStoreUploadStager,
|
|
31
|
+
BlobStoreUploadStagerConfig,
|
|
32
|
+
)
|
|
33
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
34
|
+
|
|
35
|
+
# Value used as the `connector_type` of the S3 classes defined in this module.
CONNECTOR_TYPE = "s3"

# Characters listed as "characters to avoid" in S3 object keys; the indexer logs a
# warning when an object key contains any of them.
# https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines-avoid-characters # noqa
CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]
|
|
39
|
+
|
|
40
|
+
if TYPE_CHECKING:
|
|
41
|
+
from s3fs import S3FileSystem
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class S3IndexerConfig(FsspecIndexerConfig):
    # S3 adds no indexer options of its own; everything is inherited from
    # FsspecIndexerConfig.
    pass
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class S3AccessConfig(FsspecAccessConfig):
    # Explicit S3 credentials. Every field is optional and defaults to None;
    # S3ConnectionConfig.get_access_config() treats the credentials as "explicit"
    # as soon as any one of the three is set.
    key: Optional[str] = Field(
        default=None,
        description="If not anonymous, use this access key ID, if specified. Takes precedence "
        "over `aws_access_key_id` in client_kwargs.",
    )
    secret: Optional[str] = Field(
        default=None, description="If not anonymous, use this secret access key, if specified."
    )
    token: Optional[str] = Field(
        default=None, description="If not anonymous, use this security token, if specified."
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class S3ConnectionConfig(FsspecConnectionConfig):
    # Connection settings for S3 and S3-compatible object stores. Credential
    # selection lives in get_access_config(); error translation in wrap_error().

    # Declared with pydantic's Field (not dataclasses.field) so it matches every
    # other field on this model.
    supported_protocols: list[str] = Field(default_factory=lambda: ["s3", "s3a"], init=False)
    access_config: Secret[S3AccessConfig] = Field(default=S3AccessConfig(), validate_default=True)
    endpoint_url: Optional[str] = Field(
        default=None,
        description="Use this endpoint_url, if specified. Needed for "
        "connecting to non-AWS S3 buckets.",
    )
    anonymous: bool = Field(
        default=False, description="Connect to s3 without local AWS credentials."
    )
    ambient_credentials: bool = Field(
        default=False,
        description="Explicitly allow using ambient AWS credentials from .aws folder, "
        "environment variables, or IAM roles. Requires ALLOW_AMBIENT_CREDENTIALS_S3 environment "
        "variable to also be set to 'true' (case insensitive) for security. When False (default), "
        "only explicit credentials or anonymous access are allowed.",
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Return the access options derived from this configuration.

        The authentication mode is chosen in this order:

        1. Explicit credentials (any of ``key``/``secret``/``token`` set):
           ``anon=False`` plus every non-``None`` access-config value.
        2. ``ambient_credentials=True``: ``anon=False`` with no credentials, but
           only when the ``ALLOW_AMBIENT_CREDENTIALS_S3`` environment variable is
           ``"true"`` (case-insensitive).
        3. ``anonymous=True``: ``anon=True``.

        ``endpoint_url`` is added when set, and ``cache_regions`` is always ``True``.

        Raises:
            UserAuthError: if ambient credentials are requested without the
                environment opt-in (this takes priority over ``anonymous``), or
                if no authentication mode is selected at all.
        """
        access_config = self.access_config.get_secret_value()
        has_explicit_credentials = bool(
            access_config.key or access_config.secret or access_config.token
        )

        access_configs: dict[str, Any]

        if has_explicit_credentials:
            access_configs = {"anon": False}
            # Avoid injecting None by filtering out k,v pairs where the value is None
            access_configs.update(
                {k: v for k, v in access_config.model_dump().items() if v is not None}
            )
        elif self.ambient_credentials:
            if os.getenv("ALLOW_AMBIENT_CREDENTIALS_S3", "").lower() == "true":
                logger.info(
                    "Using ambient AWS credentials (environment variables, .aws folder, IAM roles)"
                )
                access_configs = {"anon": False}
                # Don't pass explicit credentials, let s3fs/boto3 auto-detect
            else:
                # Field allows but environment doesn't - raise error for security
                raise UserAuthError(
                    "Ambient credentials requested (ambient_credentials=True) but "
                    "ALLOW_AMBIENT_CREDENTIALS_S3 environment variable is not set to 'true'. "
                )
        elif self.anonymous:
            access_configs = {"anon": True}
        else:
            # User set anonymous=False but provided no credentials and no ambient permission
            raise UserAuthError(
                "No authentication method specified. anonymous=False but no explicit credentials "
                "provided and ambient_credentials=False."
            )

        if self.endpoint_url:
            access_configs["endpoint_url"] = self.endpoint_url

        # This allows s3fs to properly follow AWS region redirects
        access_configs["cache_regions"] = True

        return access_configs

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["S3FileSystem", None, None]:
        """Yield the client produced by the parent ``get_client`` for ``protocol``.

        The override only adds the s3fs/fsspec dependency check and narrows the
        annotated client type.
        """
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an exception raised while talking to S3 into a project error.

        Args:
            e: The exception to translate.

        Returns:
            ``UserAuthError`` for ``PermissionError``; ``UserError`` for
            ``FileNotFoundError`` or a 4xx status on the chained cause;
            ``ProviderError`` for a 5xx status; otherwise ``e`` itself, after
            logging it as unhandled.
        """
        # s3fs maps botocore errors into python ones using mapping here:
        # https://github.com/fsspec/s3fs/blob/main/s3fs/errors.py
        if isinstance(e, PermissionError):
            return UserAuthError(e)
        if isinstance(e, FileNotFoundError):
            return UserError(f"File not found: {e}")

        # Only causes that carry a ``response`` dict can be classified by status.
        # Read it defensively: an arbitrary ``__cause__`` (or a partial response)
        # must not make the error translator itself raise and hide the real error.
        response = getattr(getattr(e, "__cause__", None), "response", None)
        if isinstance(response, dict):
            error_meta = response.get("ResponseMetadata") or {}
            error_body = response.get("Error") or {}
            http_code = error_meta.get("HTTPStatusCode") if isinstance(error_meta, dict) else None
            message = error_body.get("Message", str(e)) if isinstance(error_body, dict) else str(e)
            if isinstance(http_code, int):
                if 400 <= http_code < 500:
                    return UserError(message)
                if http_code >= 500:
                    return ProviderError(message)

        logger.error(
            "Unhandled exception from S3 (type: %s, endpoint: %s): %s",
            type(e).__name__,
            self.endpoint_url or "default",
            e,
            exc_info=True,
        )
        return e
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@dataclass
class S3Indexer(FsspecIndexer):
    """Indexer for the S3 connector: maps S3 listing entries to source metadata."""

    connection_config: S3ConnectionConfig
    index_config: S3IndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def get_path(self, file_info: dict) -> str:
        """Return the object key stored under ``"Key"`` in ``file_info``."""
        return file_info["Key"]

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build the ``FileDataSourceMetadata`` for one listing entry.

        Both the created and modified dates come from ``LastModified``; the
        version is the ``ETag`` with its surrounding double quotes removed.
        """
        path = file_info["Key"]

        self.log_debug("Getting metadata for S3 object", context={"file_path": path})
        self.log_file_operation("Getting metadata", file_path=path)

        # The listing only exposes LastModified, so it serves as both dates.
        last_modified = file_info.get("LastModified")
        modified_ts = str(last_modified.timestamp()) if last_modified else None

        # Prefer the lowercase "size" key, falling back to "Size".
        file_size = file_info.get("size") or file_info.get("Size")

        version = file_info["ETag"].strip('"') if "ETag" in file_info else None

        # Best effort: an AttributeError while fetching the object's metadata
        # leaves the mapping empty instead of failing the whole call.
        metadata: dict[str, str] = {}
        with contextlib.suppress(AttributeError):
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                metadata = client.metadata(path=path)

        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        if metadata:
            record_locator["metadata"] = metadata

        issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
        if issue_characters:
            self.log_warning(
                f"File path contains characters that can cause issues with S3: {issue_characters}",
                context={"path": path, "problematic_characters": issue_characters},
            )

        return FileDataSourceMetadata(
            date_created=modified_ts,
            date_modified=modified_ts,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_size,
        )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class S3DownloaderConfig(FsspecDownloaderConfig):
    # S3 adds no downloader options of its own; everything is inherited from
    # FsspecDownloaderConfig.
    pass
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
@dataclass
class S3Downloader(FsspecDownloader):
    # Downloader for the S3 connector. No methods are overridden here; only the
    # S3-specific field types and defaults are declared.
    protocol: str = "s3"
    connection_config: S3ConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh S3DownloaderConfig is created per instance when none is supplied.
    download_config: Optional[S3DownloaderConfig] = field(default_factory=S3DownloaderConfig)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
class S3UploaderConfig(FsspecUploaderConfig):
    # S3 adds no uploader options of its own; everything is inherited from
    # FsspecUploaderConfig.
    pass
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
@dataclass
class S3Uploader(FsspecUploader):
    # Uploader for the S3 connector. No methods are overridden here; only the
    # S3-specific field types and defaults are declared.
    connector_type: str = CONNECTOR_TYPE
    connection_config: S3ConnectionConfig
    # NOTE(review): annotated as S3UploaderConfig but defaults to None — confirm that
    # callers always supply a config (or that the base class tolerates None).
    upload_config: S3UploaderConfig = field(default=None)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
# Groups the S3 indexer, downloader and their config classes into one source
# registry entry. Where the entry is registered is not visible in this module.
s3_source_entry = SourceRegistryEntry(
    indexer=S3Indexer,
    indexer_config=S3IndexerConfig,
    downloader=S3Downloader,
    downloader_config=S3DownloaderConfig,
    connection_config=S3ConnectionConfig,
)
|
|
246
|
+
|
|
247
|
+
# Groups the S3 uploader with the generic blob-store upload stager into one
# destination registry entry. Where the entry is registered is not visible in
# this module.
s3_destination_entry = DestinationRegistryEntry(
    uploader=S3Uploader,
    uploader_config=S3UploaderConfig,
    connection_config=S3ConnectionConfig,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from time import time
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
from pydantic import Field, Secret
|
|
12
|
+
|
|
13
|
+
from unstructured_ingest.data_types.file_data import FileData, FileDataSourceMetadata
|
|
14
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
15
|
+
DestinationRegistryEntry,
|
|
16
|
+
SourceRegistryEntry,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
19
|
+
FsspecAccessConfig,
|
|
20
|
+
FsspecConnectionConfig,
|
|
21
|
+
FsspecDownloader,
|
|
22
|
+
FsspecDownloaderConfig,
|
|
23
|
+
FsspecIndexer,
|
|
24
|
+
FsspecIndexerConfig,
|
|
25
|
+
FsspecUploader,
|
|
26
|
+
FsspecUploaderConfig,
|
|
27
|
+
)
|
|
28
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
29
|
+
BlobStoreUploadStager,
|
|
30
|
+
BlobStoreUploadStagerConfig,
|
|
31
|
+
)
|
|
32
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from fsspec.implementations.sftp import SFTPFileSystem
|
|
36
|
+
|
|
37
|
+
# Value used as the `connector_type` of the SFTP classes defined in this module.
CONNECTOR_TYPE = "sftp"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SftpIndexerConfig(FsspecIndexerConfig):
    def model_post_init(self, __context: Any) -> None:
        """Derive ``path_without_protocol`` from ``remote_url`` after validation.

        When ``remote_url`` ends in a file extension it is treated as a file and
        its parent directory is used; otherwise the URL path is used as is. In
        both cases leading slashes are stripped.
        """
        super().model_post_init(__context)
        # The extension test runs on the whole URL, not just its path component.
        points_to_file = bool(os.path.splitext(self.remote_url)[1])
        url_path = urlparse(self.remote_url).path
        if points_to_file:
            url_path = Path(url_path).parent.as_posix()
        self.path_without_protocol = url_path.lstrip("/")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class SftpAccessConfig(FsspecAccessConfig):
    # Secret part of the SFTP connection settings. The password is required
    # (no default) and is read by SftpConnectionConfig.get_access_config().
    password: str = Field(description="Password for sftp connection")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class SftpConnectionConfig(FsspecConnectionConfig):
    # Connection settings for an SFTP server. `host` and `port` may be
    # overwritten later from the remote URL (see SftpIndexer.__post_init__).
    supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
    access_config: Secret[SftpAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    username: str = Field(description="Username for sftp connection")
    host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
    port: int = Field(default=22, description="Port for sftp connection")
    look_for_keys: bool = Field(
        default=False, description="Whether to search for private key files in ~/.ssh/"
    )
    allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")

    def get_access_config(self) -> dict[str, Any]:
        """Return the keyword arguments used to construct the SFTP filesystem.

        The password is unwrapped from the secret ``access_config``; every other
        value is copied from the corresponding field.
        """
        access_config = {
            "username": self.username,
            "host": self.host,
            "port": self.port,
            "look_for_keys": self.look_for_keys,
            "allow_agent": self.allow_agent,
            "password": self.access_config.get_secret_value().password,
        }
        return access_config

    @contextmanager
    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def get_client(self, protocol: str) -> Generator["SFTPFileSystem", None, None]:
        """Yield a filesystem client for ``protocol`` and close its SSH client on exit.

        Args:
            protocol: Name passed to fsspec's ``get_filesystem_class``.

        Yields:
            The filesystem instance built from :meth:`get_access_config`.
        """
        # The paramiko.SSHClient() client that's opened by the SFTPFileSystem
        # never gets closed so explicitly adding that as part of this context manager
        from fsspec import get_filesystem_class

        client: SFTPFileSystem = get_filesystem_class(protocol)(
            **self.get_access_config(),
        )
        try:
            yield client
        finally:
            # Close even when the caller's ``with`` body raises; without the
            # ``finally`` the SSH connection would leak on every failed operation.
            client.client.close()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class SftpIndexer(FsspecIndexer):
    """Indexer for the SFTP connector."""

    connection_config: SftpConnectionConfig
    index_config: SftpIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        """Let the host and port in ``remote_url`` override the configured ones."""
        url = urlparse(self.index_config.remote_url)
        conn = self.connection_config
        # Fall back to the configured value when the URL omits the component.
        conn.host = url.hostname or conn.host
        conn.port = url.port or conn.port

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield the parent's results with identifiers prefixed by the server address."""
        for file_data in super().run(**kwargs):
            host = self.connection_config.host
            port = self.connection_config.port
            file_data.identifier = f"sftp://{host}:{port}/{file_data.identifier}"
            yield file_data

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build the ``FileDataSourceMetadata`` for one listing entry.

        ``time`` and ``mtime`` are converted to stringified timestamps when the
        keys are present; ``size`` is passed through unchanged.
        """
        remote_path = file_info["name"]

        created = str(file_info["time"].timestamp()) if "time" in file_info else None
        modified = str(file_info["mtime"].timestamp()) if "mtime" in file_info else None
        size = file_info.get("size")

        locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        return FileDataSourceMetadata(
            date_created=created,
            date_modified=modified,
            date_processed=str(time()),
            url=f"{self.index_config.protocol}://{remote_path}",
            record_locator=locator,
            filesize_bytes=size,
        )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class SftpDownloaderConfig(FsspecDownloaderConfig):
    # Downloader settings for SFTP: the fsspec downloader config plus the
    # remote URL, which SftpDownloader parses for the host and port.
    remote_url: str = Field(
        description="Remote fsspec URL formatted as `protocol://dir/path`",
    )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass
class SftpDownloader(FsspecDownloader):
    """Downloader for SFTP built on the fsspec downloader.

    Host and port found in the download config's remote URL override the
    values on the connection config.
    """

    protocol: str = "sftp"
    connection_config: SftpConnectionConfig
    # Plain string default, matching SftpIndexer and SftpUploader. The previous
    # pydantic ``Field(default=..., init=False)`` is not understood by a
    # standard dataclass, which would store the Field object itself as the
    # default instead of the connector name.
    connector_type: str = CONNECTOR_TYPE
    # NOTE(review): SftpDownloaderConfig declares ``remote_url`` without a
    # default, so this factory presumably cannot build a valid config on its
    # own - confirm callers always pass ``download_config``.
    download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)

    def __post_init__(self):
        """Copy host/port from the remote URL onto the connection config."""
        parsed_url = urlparse(self.download_config.remote_url)
        # Keep the configured value whenever the URL does not carry that part.
        self.connection_config.host = parsed_url.hostname or self.connection_config.host
        self.connection_config.port = parsed_url.port or self.connection_config.port
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class SftpUploaderConfig(FsspecUploaderConfig):
    # No SFTP-specific upload options: everything is inherited unchanged from
    # the fsspec uploader config.
    pass
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@dataclass
class SftpUploader(FsspecUploader):
    """Uploader for SFTP; adds no behaviour beyond the fsspec uploader."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: SftpConnectionConfig
    # Defaults to None when no upload config is supplied.
    upload_config: SftpUploaderConfig = None
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# Source-side registration: the classes used to connect, index and download
# when SFTP is the source.
sftp_source_entry = SourceRegistryEntry(
    connection_config=SftpConnectionConfig,
    indexer_config=SftpIndexerConfig,
    indexer=SftpIndexer,
    downloader_config=SftpDownloaderConfig,
    downloader=SftpDownloader,
)
|
|
170
|
+
|
|
171
|
+
# Destination-side registration: the classes used to connect, stage and upload
# when SFTP is the destination.
sftp_destination_entry = DestinationRegistryEntry(
    connection_config=SftpConnectionConfig,
    uploader_config=SftpUploaderConfig,
    uploader=SftpUploader,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Callable
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def json_serial(obj):
    """Convert a value ``json`` cannot encode natively into a string.

    Paths become their POSIX string form and datetimes their ISO 8601 form.

    Raises:
        TypeError: If ``obj`` is neither a ``Path`` nor a ``datetime``.
    """
    if isinstance(obj, Path):
        serialized = obj.as_posix()
    elif isinstance(obj, datetime):
        serialized = obj.isoformat()
    else:
        raise TypeError("Type %s not serializable" % type(obj))
    return serialized
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def sterilize_dict(data: dict, default: Callable = json_serial) -> dict:
    """Round-trip ``data`` through JSON so only JSON-native values remain.

    Args:
        data: Dictionary to convert.
        default: Fallback encoder handed to ``json.dumps`` for values it
            cannot serialize itself.

    Returns:
        The result of decoding the JSON encoding of ``data``.
    """
    return json.loads(json.dumps(data, default=default))
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from time import time
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
from uuid import NAMESPACE_DNS, uuid5
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, Secret, field_validator
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.data_types.file_data import (
|
|
11
|
+
FileData,
|
|
12
|
+
FileDataSourceMetadata,
|
|
13
|
+
SourceIdentifiers,
|
|
14
|
+
)
|
|
15
|
+
from unstructured_ingest.error import (
|
|
16
|
+
ProviderError,
|
|
17
|
+
UnstructuredIngestError,
|
|
18
|
+
UserAuthError,
|
|
19
|
+
UserError,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.interfaces import (
|
|
22
|
+
AccessConfig,
|
|
23
|
+
ConnectionConfig,
|
|
24
|
+
Downloader,
|
|
25
|
+
DownloaderConfig,
|
|
26
|
+
Indexer,
|
|
27
|
+
IndexerConfig,
|
|
28
|
+
download_responses,
|
|
29
|
+
)
|
|
30
|
+
from unstructured_ingest.logger import logger
|
|
31
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
32
|
+
SourceRegistryEntry,
|
|
33
|
+
)
|
|
34
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from github import ContentFile, GitTreeElement, Repository
|
|
38
|
+
from github import Github as GithubClient
|
|
39
|
+
from github.GithubException import GithubException
|
|
40
|
+
from requests import HTTPError
|
|
41
|
+
|
|
42
|
+
# Connector name used as the default ``connector_type`` of the Github indexer
# and downloader defined in this module.
CONNECTOR_TYPE = "github"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class GithubAccessConfig(AccessConfig):
    # Credential passed to the Github client as its login token.
    # Fixed the "acess" typo in the user-facing field description.
    access_token: str = Field(description="Github access token")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class GithubConnectionConfig(ConnectionConfig):
    # Connection settings for one Github repository, plus helpers that build
    # the client/repository objects and translate errors into ingest errors.
    access_config: Secret[GithubAccessConfig]
    url: str = Field(description="Github url or repository owner/name pair")

    @field_validator("url", mode="after")
    def conform_url(cls, value: str):
        """Reduce a Github URL (or an ``owner/name`` pair) to its bare path.

        Surrounding slashes are stripped: the path of a full URL starts with
        ``/``, which would otherwise yield ``https://github.com//owner/name``
        in ``get_full_url`` and a slash-prefixed name in ``get_repo``.
        """
        parsed_url = urlparse(value)
        return parsed_url.path.strip("/")

    def get_full_url(self):
        """Return the ``https://github.com/`` URL for the configured path."""
        return f"https://github.com/{self.url}"

    @requires_dependencies(["github"], extras="github")
    def get_client(self) -> "GithubClient":
        """Create a Github client authenticated with the configured token."""
        from github import Github as GithubClient

        return GithubClient(login_or_token=self.access_config.get_secret_value().access_token)

    def get_repo(self) -> "Repository":
        """Fetch the repository named by ``url`` using a new client."""
        client = self.get_client()
        return client.get_repo(self.url)

    def wrap_github_exception(self, e: "GithubException") -> Exception:
        """Map a Github exception to an ingest error based on its status.

        Returns:
            ``UserAuthError`` for 401, ``UserError`` for other 4xx statuses,
            ``ProviderError`` for 5xx statuses, otherwise ``e`` unchanged.
        """
        data = e.data
        status_code = e.status
        # The payload is presumably the decoded JSON error body; fall back to
        # its string form so a non-dict payload cannot raise while an error is
        # being wrapped and hide the original failure.
        message = data.get("message") if isinstance(data, dict) else str(data)
        if status_code == 401:
            return UserAuthError(f"Unauthorized access to Github: {message}")
        if 400 <= status_code < 500:
            return UserError(message)
        # ">=" so that a plain 500 response is also reported as a provider error.
        if status_code >= 500:
            return ProviderError(message)
        logger.debug(f"unhandled github error: {e}")
        return e

    def wrap_http_error(self, e: "HTTPError") -> Exception:
        """Map an HTTP error to an ingest error based on its response status.

        Returns:
            ``UserAuthError`` for 401, ``UserError`` for other 4xx statuses,
            ``ProviderError`` for 5xx statuses, otherwise a generic
            ``UnstructuredIngestError``.
        """
        status_code = e.response.status_code
        if status_code == 401:
            return UserAuthError(f"Unauthorized access to Github: {e.response.text}")
        if 400 <= status_code < 500:
            return UserError(e.response.text)
        # ">=" so that a plain 500 response is also reported as a provider error.
        if status_code >= 500:
            return ProviderError(e.response.text)
        logger.debug(f"unhandled http error: {e}")
        return UnstructuredIngestError(str(e))

    @requires_dependencies(["requests"], extras="github")
    def wrap_error(self, e: Exception) -> Exception:
        """Translate any exception into an ingest error.

        Github and HTTP errors are dispatched to their dedicated wrappers;
        anything else becomes an ``UnstructuredIngestError``. The error is
        returned, not raised, so callers decide how to raise it.
        """
        from github.GithubException import GithubException
        from requests import HTTPError

        if isinstance(e, GithubException):
            return self.wrap_github_exception(e=e)
        if isinstance(e, HTTPError):
            return self.wrap_http_error(e=e)
        logger.debug(f"unhandled error: {e}")
        return UnstructuredIngestError(str(e))
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class GithubIndexerConfig(IndexerConfig):
    # Options controlling which part of the repository gets indexed.
    branch: Optional[str] = Field(
        default=None,
        description="Branch to index, use the default if one isn't provided",
    )
    recursive: bool = Field(
        default=True,
        description="Recursively index all files in the repository",
    )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass
class GithubIndexer(Indexer):
    """Indexer that lists the files of a Github repository branch."""

    connection_config: GithubConnectionConfig
    index_config: GithubIndexerConfig = field(default_factory=GithubIndexerConfig)
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Verify the repository is reachable.

        Raises:
            Exception: The error produced by ``connection_config.wrap_error``,
                chained to the original failure.
        """
        try:
            self.connection_config.get_repo()
        except Exception as e:
            raise self.connection_config.wrap_error(e=e) from e

    def get_branch(self) -> str:
        """Return the configured branch, or the repository default branch."""
        configured = self.index_config.branch
        if configured:
            # No need to fetch the repository when the branch is configured.
            return configured
        return self.connection_config.get_repo().default_branch

    def list_files(self) -> list["GitTreeElement"]:
        """Return the tree elements of the branch that have a positive size."""
        repo = self.connection_config.get_repo()
        sha = self.index_config.branch or repo.default_branch
        git_tree = repo.get_git_tree(sha, recursive=self.index_config.recursive)
        return [
            element for element in git_tree.tree if element.size is not None and element.size > 0
        ]

    def convert_element(
        self, element: "GitTreeElement", branch: Optional[str] = None
    ) -> FileData:
        """Convert one tree element into a ``FileData`` record.

        Args:
            element: Tree element to describe.
            branch: Branch the element belongs to. When omitted it is resolved
                with ``get_branch()``; ``run`` passes it in so the lookup is
                not repeated for every element.
        """
        # Resolve the branch once; it used to be looked up twice per element,
        # each time through a fresh ``get_repo()`` call.
        branch = branch or self.get_branch()
        full_path = f"{self.connection_config.get_full_url()}/blob/{branch}/{element.path}"

        # Guard against a missing last-modified value instead of failing on
        # ``None.timestamp()``.
        modified_at = element.last_modified_datetime
        date_modified = str(modified_at.timestamp()) if modified_at is not None else None

        return FileData(
            identifier=str(uuid5(NAMESPACE_DNS, full_path)),
            connector_type=self.connector_type,
            display_name=full_path,
            source_identifiers=SourceIdentifiers(
                filename=Path(element.path).name,
                fullpath=(Path(branch) / element.path).as_posix(),
                rel_path=element.path,
            ),
            metadata=FileDataSourceMetadata(
                url=element.url,
                version=element.etag,
                record_locator={},
                date_modified=date_modified,
                date_processed=str(time()),
                filesize_bytes=element.size,
                permissions_data=[{"mode": element.mode}],
            ),
        )

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` record for every listed file."""
        elements = self.list_files()
        if not elements:
            return
        # The branch is the same for every element, so resolve it only once.
        branch = self.get_branch()
        for element in elements:
            yield self.convert_element(element=element, branch=branch)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class GithubDownloaderConfig(DownloaderConfig):
    # No Github-specific download options: everything is inherited unchanged
    # from the base downloader config.
    pass
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@dataclass
class GithubDownloader(Downloader):
    """Downloader that fetches single files from a Github repository."""

    download_config: GithubDownloaderConfig
    connection_config: GithubConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["github"], extras="github")
    def get_file(self, file_data: FileData) -> "ContentFile":
        """Look up the repository content entry for ``file_data``.

        Raises:
            UserError: If the path does not exist in the repository; chained
                to the underlying Github exception.
        """
        from github.GithubException import UnknownObjectException

        path = file_data.source_identifiers.relative_path
        repo = self.connection_config.get_repo()

        try:
            content_file = repo.get_contents(path)
        except UnknownObjectException as e:
            logger.error(f"File doesn't exists {self.connection_config.url}/{path}: {e}")
            # Chain the cause so the Github error stays visible in tracebacks.
            raise UserError(f"File not found: {path}") from e
        return content_file

    @requires_dependencies(["requests"], extras="github")
    def get_contents(self, content_file: "ContentFile") -> bytes:
        """Return the bytes of ``content_file``.

        The decoded content is used when it is non-empty; otherwise the file
        is fetched from its download URL.

        Raises:
            Exception: The error produced by ``connection_config.wrap_error``
                when the download request fails, chained to the HTTP error.
        """
        import requests

        if content_file.decoded_content:
            return content_file.decoded_content
        download_url = content_file.download_url
        resp = requests.get(download_url)
        try:
            resp.raise_for_status()
        except requests.HTTPError as e:
            raise self.connection_config.wrap_error(e=e) from e
        return resp.content

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download one file to its local path and return the download response."""
        content_file = self.get_file(file_data)
        contents = self.get_contents(content_file)
        download_path = self.get_download_path(file_data)
        # Repository paths may be nested; make sure the target directory
        # exists before writing (a no-op when it is already there).
        download_path.parent.mkdir(parents=True, exist_ok=True)
        with download_path.open("wb") as f:
            f.write(contents)
        return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# Source-side registration: the classes used to connect, index and download
# when Github is the source.
github_source_entry = SourceRegistryEntry(
    connection_config=GithubConnectionConfig,
    indexer_config=GithubIndexerConfig,
    indexer=GithubIndexer,
    downloader_config=GithubDownloaderConfig,
    downloader=GithubDownloader,
)
|