unstructured-ingest 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +31 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +38 -0
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +269 -0
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +90 -0
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +89 -0
- test/integration/connectors/duckdb/test_motherduck.py +95 -0
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +195 -0
- test/integration/connectors/sql/test_singlestore.py +176 -0
- test/integration/connectors/sql/test_snowflake.py +238 -0
- test/integration/connectors/sql/test_sqlite.py +162 -0
- test/integration/connectors/test_astradb.py +217 -0
- test/integration/connectors/test_azure_ai_search.py +255 -0
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_delta_table.py +185 -0
- test/integration/connectors/test_lancedb.py +247 -0
- test/integration/connectors/test_milvus.py +203 -0
- test/integration/connectors/test_mongodb.py +335 -0
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_notion.py +145 -0
- test/integration/connectors/test_onedrive.py +118 -0
- test/integration/connectors/test_pinecone.py +288 -0
- test/integration/connectors/test_qdrant.py +215 -0
- test/integration/connectors/test_redis.py +119 -0
- test/integration/connectors/test_s3.py +183 -0
- test/integration/connectors/test_vectara.py +270 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker.py +151 -0
- test/integration/connectors/utils/docker_compose.py +59 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +75 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/validation/source.py +299 -0
- test/integration/connectors/utils/validation/utils.py +36 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/integration/embedders/test_bedrock.py +103 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +71 -0
- test/integration/embedders/test_octoai.py +77 -0
- test/integration/embedders/test_openai.py +76 -0
- test/integration/embedders/test_togetherai.py +71 -0
- test/integration/embedders/test_vertexai.py +65 -0
- test/integration/embedders/test_voyageai.py +65 -0
- test/integration/embedders/utils.py +68 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +42 -0
- test/unit/embed/test_octoai.py +27 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_error.py +27 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +184 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/test_interfaces.py +26 -0
- test/unit/v2/test_utils.py +82 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/__init__.py +14 -0
- unstructured_ingest/cli/base/__init__.py +0 -0
- unstructured_ingest/cli/base/cmd.py +19 -0
- unstructured_ingest/cli/base/dest.py +87 -0
- unstructured_ingest/cli/base/src.py +57 -0
- unstructured_ingest/cli/cli.py +37 -0
- unstructured_ingest/cli/cmd_factory.py +12 -0
- unstructured_ingest/cli/cmds/__init__.py +145 -0
- unstructured_ingest/cli/cmds/airtable.py +69 -0
- unstructured_ingest/cli/cmds/astradb.py +99 -0
- unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
- unstructured_ingest/cli/cmds/biomed.py +52 -0
- unstructured_ingest/cli/cmds/chroma.py +104 -0
- unstructured_ingest/cli/cmds/clarifai.py +71 -0
- unstructured_ingest/cli/cmds/confluence.py +69 -0
- unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
- unstructured_ingest/cli/cmds/delta_table.py +94 -0
- unstructured_ingest/cli/cmds/discord.py +47 -0
- unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
- unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
- unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
- unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
- unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
- unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
- unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
- unstructured_ingest/cli/cmds/github.py +54 -0
- unstructured_ingest/cli/cmds/gitlab.py +54 -0
- unstructured_ingest/cli/cmds/google_drive.py +49 -0
- unstructured_ingest/cli/cmds/hubspot.py +70 -0
- unstructured_ingest/cli/cmds/jira.py +71 -0
- unstructured_ingest/cli/cmds/kafka.py +102 -0
- unstructured_ingest/cli/cmds/local.py +43 -0
- unstructured_ingest/cli/cmds/mongodb.py +72 -0
- unstructured_ingest/cli/cmds/notion.py +48 -0
- unstructured_ingest/cli/cmds/onedrive.py +66 -0
- unstructured_ingest/cli/cmds/opensearch.py +117 -0
- unstructured_ingest/cli/cmds/outlook.py +67 -0
- unstructured_ingest/cli/cmds/pinecone.py +71 -0
- unstructured_ingest/cli/cmds/qdrant.py +124 -0
- unstructured_ingest/cli/cmds/reddit.py +67 -0
- unstructured_ingest/cli/cmds/salesforce.py +58 -0
- unstructured_ingest/cli/cmds/sharepoint.py +66 -0
- unstructured_ingest/cli/cmds/slack.py +56 -0
- unstructured_ingest/cli/cmds/sql.py +66 -0
- unstructured_ingest/cli/cmds/vectara.py +66 -0
- unstructured_ingest/cli/cmds/weaviate.py +98 -0
- unstructured_ingest/cli/cmds/wikipedia.py +40 -0
- unstructured_ingest/cli/common.py +7 -0
- unstructured_ingest/cli/interfaces.py +663 -0
- unstructured_ingest/cli/utils.py +205 -0
- unstructured_ingest/connector/__init__.py +0 -0
- unstructured_ingest/connector/airtable.py +309 -0
- unstructured_ingest/connector/astradb.py +267 -0
- unstructured_ingest/connector/azure_ai_search.py +144 -0
- unstructured_ingest/connector/biomed.py +320 -0
- unstructured_ingest/connector/chroma.py +158 -0
- unstructured_ingest/connector/clarifai.py +122 -0
- unstructured_ingest/connector/confluence.py +285 -0
- unstructured_ingest/connector/databricks_volumes.py +137 -0
- unstructured_ingest/connector/delta_table.py +203 -0
- unstructured_ingest/connector/discord.py +180 -0
- unstructured_ingest/connector/elasticsearch.py +396 -0
- unstructured_ingest/connector/fsspec/__init__.py +0 -0
- unstructured_ingest/connector/fsspec/azure.py +78 -0
- unstructured_ingest/connector/fsspec/box.py +109 -0
- unstructured_ingest/connector/fsspec/dropbox.py +160 -0
- unstructured_ingest/connector/fsspec/fsspec.py +359 -0
- unstructured_ingest/connector/fsspec/gcs.py +82 -0
- unstructured_ingest/connector/fsspec/s3.py +62 -0
- unstructured_ingest/connector/fsspec/sftp.py +81 -0
- unstructured_ingest/connector/git.py +124 -0
- unstructured_ingest/connector/github.py +174 -0
- unstructured_ingest/connector/gitlab.py +142 -0
- unstructured_ingest/connector/google_drive.py +348 -0
- unstructured_ingest/connector/hubspot.py +278 -0
- unstructured_ingest/connector/jira.py +469 -0
- unstructured_ingest/connector/kafka.py +293 -0
- unstructured_ingest/connector/local.py +139 -0
- unstructured_ingest/connector/mongodb.py +284 -0
- unstructured_ingest/connector/notion/__init__.py +0 -0
- unstructured_ingest/connector/notion/client.py +248 -0
- unstructured_ingest/connector/notion/connector.py +469 -0
- unstructured_ingest/connector/notion/helpers.py +584 -0
- unstructured_ingest/connector/notion/interfaces.py +32 -0
- unstructured_ingest/connector/notion/types/__init__.py +0 -0
- unstructured_ingest/connector/notion/types/block.py +96 -0
- unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
- unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
- unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
- unstructured_ingest/connector/notion/types/database.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
- unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
- unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/connector/notion/types/date.py +26 -0
- unstructured_ingest/connector/notion/types/file.py +51 -0
- unstructured_ingest/connector/notion/types/page.py +45 -0
- unstructured_ingest/connector/notion/types/parent.py +66 -0
- unstructured_ingest/connector/notion/types/rich_text.py +189 -0
- unstructured_ingest/connector/notion/types/user.py +76 -0
- unstructured_ingest/connector/onedrive.py +232 -0
- unstructured_ingest/connector/opensearch.py +218 -0
- unstructured_ingest/connector/outlook.py +285 -0
- unstructured_ingest/connector/pinecone.py +140 -0
- unstructured_ingest/connector/qdrant.py +144 -0
- unstructured_ingest/connector/reddit.py +166 -0
- unstructured_ingest/connector/registry.py +109 -0
- unstructured_ingest/connector/salesforce.py +301 -0
- unstructured_ingest/connector/sharepoint.py +573 -0
- unstructured_ingest/connector/slack.py +224 -0
- unstructured_ingest/connector/sql.py +199 -0
- unstructured_ingest/connector/vectara.py +253 -0
- unstructured_ingest/connector/weaviate.py +190 -0
- unstructured_ingest/connector/wikipedia.py +208 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/embed/bedrock.py +193 -0
- unstructured_ingest/embed/huggingface.py +52 -0
- unstructured_ingest/embed/interfaces.py +117 -0
- unstructured_ingest/embed/mixedbreadai.py +233 -0
- unstructured_ingest/embed/octoai.py +130 -0
- unstructured_ingest/embed/openai.py +116 -0
- unstructured_ingest/embed/togetherai.py +106 -0
- unstructured_ingest/embed/vertexai.py +126 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
- unstructured_ingest/enhanced_dataclass/core.py +99 -0
- unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
- unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
- unstructured_ingest/error.py +49 -0
- unstructured_ingest/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/ingest_backoff/_common.py +102 -0
- unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
- unstructured_ingest/interfaces.py +852 -0
- unstructured_ingest/logger.py +130 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/pipeline/__init__.py +22 -0
- unstructured_ingest/pipeline/copy.py +19 -0
- unstructured_ingest/pipeline/doc_factory.py +12 -0
- unstructured_ingest/pipeline/interfaces.py +270 -0
- unstructured_ingest/pipeline/partition.py +60 -0
- unstructured_ingest/pipeline/permissions.py +12 -0
- unstructured_ingest/pipeline/pipeline.py +117 -0
- unstructured_ingest/pipeline/reformat/__init__.py +0 -0
- unstructured_ingest/pipeline/reformat/chunking.py +134 -0
- unstructured_ingest/pipeline/reformat/embedding.py +64 -0
- unstructured_ingest/pipeline/source.py +77 -0
- unstructured_ingest/pipeline/utils.py +6 -0
- unstructured_ingest/pipeline/write.py +18 -0
- unstructured_ingest/processor.py +93 -0
- unstructured_ingest/runner/__init__.py +104 -0
- unstructured_ingest/runner/airtable.py +35 -0
- unstructured_ingest/runner/astradb.py +34 -0
- unstructured_ingest/runner/base_runner.py +89 -0
- unstructured_ingest/runner/biomed.py +45 -0
- unstructured_ingest/runner/confluence.py +35 -0
- unstructured_ingest/runner/delta_table.py +34 -0
- unstructured_ingest/runner/discord.py +35 -0
- unstructured_ingest/runner/elasticsearch.py +40 -0
- unstructured_ingest/runner/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/fsspec/azure.py +30 -0
- unstructured_ingest/runner/fsspec/box.py +28 -0
- unstructured_ingest/runner/fsspec/dropbox.py +30 -0
- unstructured_ingest/runner/fsspec/fsspec.py +40 -0
- unstructured_ingest/runner/fsspec/gcs.py +28 -0
- unstructured_ingest/runner/fsspec/s3.py +28 -0
- unstructured_ingest/runner/fsspec/sftp.py +28 -0
- unstructured_ingest/runner/github.py +37 -0
- unstructured_ingest/runner/gitlab.py +37 -0
- unstructured_ingest/runner/google_drive.py +35 -0
- unstructured_ingest/runner/hubspot.py +35 -0
- unstructured_ingest/runner/jira.py +35 -0
- unstructured_ingest/runner/kafka.py +34 -0
- unstructured_ingest/runner/local.py +23 -0
- unstructured_ingest/runner/mongodb.py +34 -0
- unstructured_ingest/runner/notion.py +61 -0
- unstructured_ingest/runner/onedrive.py +35 -0
- unstructured_ingest/runner/opensearch.py +40 -0
- unstructured_ingest/runner/outlook.py +33 -0
- unstructured_ingest/runner/reddit.py +35 -0
- unstructured_ingest/runner/salesforce.py +33 -0
- unstructured_ingest/runner/sharepoint.py +35 -0
- unstructured_ingest/runner/slack.py +33 -0
- unstructured_ingest/runner/utils.py +47 -0
- unstructured_ingest/runner/wikipedia.py +35 -0
- unstructured_ingest/runner/writers/__init__.py +48 -0
- unstructured_ingest/runner/writers/astradb.py +22 -0
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/runner/writers/base_writer.py +26 -0
- unstructured_ingest/runner/writers/chroma.py +22 -0
- unstructured_ingest/runner/writers/clarifai.py +19 -0
- unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
- unstructured_ingest/runner/writers/delta_table.py +24 -0
- unstructured_ingest/runner/writers/elasticsearch.py +24 -0
- unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
- unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
- unstructured_ingest/runner/writers/fsspec/box.py +21 -0
- unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
- unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
- unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
- unstructured_ingest/runner/writers/kafka.py +21 -0
- unstructured_ingest/runner/writers/mongodb.py +21 -0
- unstructured_ingest/runner/writers/opensearch.py +26 -0
- unstructured_ingest/runner/writers/pinecone.py +21 -0
- unstructured_ingest/runner/writers/qdrant.py +19 -0
- unstructured_ingest/runner/writers/sql.py +22 -0
- unstructured_ingest/runner/writers/vectara.py +22 -0
- unstructured_ingest/runner/writers/weaviate.py +21 -0
- unstructured_ingest/utils/__init__.py +0 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +118 -0
- unstructured_ingest/utils/data_prep.py +200 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/utils/string_and_date_utils.py +49 -0
- unstructured_ingest/utils/table.py +73 -0
- unstructured_ingest/v2/__init__.py +1 -0
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +4 -0
- unstructured_ingest/v2/cli/base/cmd.py +269 -0
- unstructured_ingest/v2/cli/base/dest.py +85 -0
- unstructured_ingest/v2/cli/base/importer.py +34 -0
- unstructured_ingest/v2/cli/base/src.py +85 -0
- unstructured_ingest/v2/cli/cli.py +24 -0
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/cli/utils/click.py +237 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/errors.py +18 -0
- unstructured_ingest/v2/interfaces/__init__.py +32 -0
- unstructured_ingest/v2/interfaces/connector.py +50 -0
- unstructured_ingest/v2/interfaces/downloader.py +89 -0
- unstructured_ingest/v2/interfaces/file_data.py +116 -0
- unstructured_ingest/v2/interfaces/indexer.py +30 -0
- unstructured_ingest/v2/interfaces/process.py +19 -0
- unstructured_ingest/v2/interfaces/processor.py +88 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
- unstructured_ingest/v2/interfaces/uploader.py +53 -0
- unstructured_ingest/v2/logger.py +126 -0
- unstructured_ingest/v2/main.py +11 -0
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +211 -0
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +384 -0
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
- unstructured_ingest/v2/pipeline/steps/download.py +207 -0
- unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +86 -0
- unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
- unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
- unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +124 -0
- unstructured_ingest/v2/processes/connector_registry.py +69 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
- unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
- unstructured_ingest/v2/processes/connectors/discord.py +158 -0
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
- unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
- unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/local.py +217 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
- unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
- unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
- unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
- unstructured_ingest/v2/processes/connectors/utils.py +29 -0
- unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
- unstructured_ingest/v2/processes/embedder.py +195 -0
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +188 -0
- unstructured_ingest/v2/processes/uncompress.py +61 -0
- unstructured_ingest/v2/unstructured_api.py +128 -0
- unstructured_ingest/v2/utils.py +61 -0
- unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
- unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
- unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
- unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
- unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from time import time
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
|
+
from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
|
|
11
|
+
from unstructured_ingest.v2.interfaces import (
|
|
12
|
+
FileDataSourceMetadata,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.v2.logger import logger
|
|
15
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
16
|
+
DestinationRegistryEntry,
|
|
17
|
+
SourceRegistryEntry,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
20
|
+
FsspecAccessConfig,
|
|
21
|
+
FsspecConnectionConfig,
|
|
22
|
+
FsspecDownloader,
|
|
23
|
+
FsspecDownloaderConfig,
|
|
24
|
+
FsspecIndexer,
|
|
25
|
+
FsspecIndexerConfig,
|
|
26
|
+
FsspecUploader,
|
|
27
|
+
FsspecUploaderConfig,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
CONNECTOR_TYPE = "s3"
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from s3fs import S3FileSystem
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class S3IndexerConfig(FsspecIndexerConfig):
    # Indexer settings for S3; adds nothing beyond the generic fsspec indexer config.
    pass
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class S3AccessConfig(FsspecAccessConfig):
    # Optional S3 credentials. Every field defaults to None so that a config
    # with no explicit credentials can still be constructed.
    key: Optional[str] = Field(
        default=None,
        description=(
            "If not anonymous, use this access key ID, if specified. Takes precedence "
            "over `aws_access_key_id` in client_kwargs."
        ),
    )
    secret: Optional[str] = Field(
        default=None,
        description="If not anonymous, use this secret access key, if specified.",
    )
    token: Optional[str] = Field(
        default=None,
        description="If not anonymous, use this security token, if specified.",
    )
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class S3ConnectionConfig(FsspecConnectionConfig):
    # Connection settings for S3 (and S3-compatible endpoints via `endpoint_url`).
    #
    # `supported_protocols` is declared with pydantic's `Field`, matching the other
    # fields of this model; the stdlib `dataclasses.field` is not a pydantic field
    # declaration.
    supported_protocols: list[str] = Field(default_factory=lambda: ["s3", "s3a"], init=False)
    access_config: Secret[S3AccessConfig] = Field(default=S3AccessConfig(), validate_default=True)
    endpoint_url: Optional[str] = Field(
        default=None,
        description="Use this endpoint_url, if specified. Needed for "
        "connecting to non-AWS S3 buckets.",
    )
    anonymous: bool = Field(
        default=False, description="Connect to s3 without local AWS credentials."
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Build the keyword arguments used to open the S3 filesystem.

        Returns:
            A dict holding ``anon``, ``endpoint_url`` (only when set) and every
            truthy entry of the secret access config.
        """
        access_configs: dict[str, Any] = {"anon": self.anonymous}
        if self.endpoint_url:
            access_configs["endpoint_url"] = self.endpoint_url

        # Avoid injecting None by filtering out k,v pairs where the value is falsy
        access_configs.update(
            {k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v}
        )
        return access_configs

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["S3FileSystem", None, None]:
        """Yield the filesystem client produced by the parent class for ``protocol``."""
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an exception raised by s3fs into an ingest error type.

        Args:
            e: The exception raised while talking to S3.

        Returns:
            ``UserAuthError`` for ``PermissionError``, ``UserError`` for
            ``FileNotFoundError`` or a 4xx response, ``ProviderError`` for a 5xx
            response; otherwise ``e`` itself after logging it.
        """
        # s3fs maps botocore errors into python ones using mapping here:
        # https://github.com/fsspec/s3fs/blob/main/s3fs/errors.py
        if isinstance(e, PermissionError):
            return UserAuthError(e)
        if isinstance(e, FileNotFoundError):
            return UserError(e)

        # Only inspect the cause when it actually carries a response mapping;
        # an arbitrary chained exception must not make the error handler itself
        # fail with AttributeError/KeyError.
        cause = getattr(e, "__cause__", None)
        error_response = getattr(cause, "response", None)
        if isinstance(error_response, dict):
            error_meta = error_response.get("ResponseMetadata") or {}
            http_code = error_meta.get("HTTPStatusCode")
            message = (error_response.get("Error") or {}).get("Message", str(e))
            if isinstance(http_code, int):
                if 400 <= http_code < 500:
                    return UserError(message)
                if http_code >= 500:
                    return ProviderError(message)
        logger.error(f"unhandled exception from s3 ({type(e)}): {e}", exc_info=True)
        return e
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass
class S3Indexer(FsspecIndexer):
    # Indexer for S3 listings; listing entries are dicts keyed as read below
    # ("Key", "LastModified", "size"/"Size", "ETag").
    connection_config: S3ConnectionConfig
    index_config: S3IndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def get_path(self, file_data: dict) -> str:
        """Return the object key of a listing entry."""
        return file_data["Key"]

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build source metadata for one listing entry.

        Args:
            file_data: Listing entry; must contain ``"Key"``. ``"LastModified"``,
                ``"size"``/``"Size"`` and ``"ETag"`` are used when present.

        Returns:
            A ``FileDataSourceMetadata`` describing the object.
        """
        path = file_data["Key"]

        # The listing only exposes a modification time, so it is used for both dates.
        date_created = None
        date_modified = None
        modified = file_data.get("LastModified")
        if modified:
            date_created = date_modified = str(modified.timestamp())

        # Fall back to "Size" only when "size" is missing, so that a legitimate
        # size of 0 is not discarded.
        file_size = file_data.get("size")
        if file_size is None:
            file_size = file_data.get("Size")

        # ETags are wrapped in double quotes; tolerate a missing or None value.
        etag = file_data.get("ETag")
        version = etag.strip('"') if etag is not None else None

        metadata: dict[str, str] = {}
        # Best effort: an AttributeError while fetching object metadata is ignored.
        with contextlib.suppress(AttributeError):
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                metadata = client.metadata(path=path)

        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        if metadata:
            record_locator["metadata"] = metadata

        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_size,
        )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class S3DownloaderConfig(FsspecDownloaderConfig):
    # Downloader settings for S3; adds nothing beyond the generic fsspec downloader config.
    pass
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@dataclass
class S3Downloader(FsspecDownloader):
    # Downloader bound to the "s3" protocol; all behaviour comes from FsspecDownloader.
    protocol: str = "s3"
    connection_config: S3ConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh default config is created per instance.
    download_config: Optional[S3DownloaderConfig] = field(default_factory=S3DownloaderConfig)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class S3UploaderConfig(FsspecUploaderConfig):
    # Uploader settings for S3; adds nothing beyond the generic fsspec uploader config.
    pass
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
@dataclass
class S3Uploader(FsspecUploader):
    # Uploader for S3; all behaviour comes from FsspecUploader.
    connector_type: str = CONNECTOR_TYPE
    connection_config: S3ConnectionConfig
    # NOTE(review): defaults to None although annotated as S3UploaderConfig.
    upload_config: S3UploaderConfig = field(default=None)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# Registry entries tying the S3 classes together for the source and
# destination sides of a pipeline.
s3_source_entry = SourceRegistryEntry(
    connection_config=S3ConnectionConfig,
    indexer_config=S3IndexerConfig,
    indexer=S3Indexer,
    downloader_config=S3DownloaderConfig,
    downloader=S3Downloader,
)

s3_destination_entry = DestinationRegistryEntry(
    connection_config=S3ConnectionConfig,
    uploader_config=S3UploaderConfig,
    uploader=S3Uploader,
)
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from time import time
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
from pydantic import Field, Secret
|
|
12
|
+
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata
|
|
15
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
16
|
+
DestinationRegistryEntry,
|
|
17
|
+
SourceRegistryEntry,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
20
|
+
FsspecAccessConfig,
|
|
21
|
+
FsspecConnectionConfig,
|
|
22
|
+
FsspecDownloader,
|
|
23
|
+
FsspecDownloaderConfig,
|
|
24
|
+
FsspecIndexer,
|
|
25
|
+
FsspecIndexerConfig,
|
|
26
|
+
FsspecUploader,
|
|
27
|
+
FsspecUploaderConfig,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from fsspec.implementations.sftp import SFTPFileSystem
|
|
32
|
+
|
|
33
|
+
CONNECTOR_TYPE = "sftp"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class SftpIndexerConfig(FsspecIndexerConfig):
    # Indexer settings for SFTP; recomputes `path_without_protocol` from the
    # path component of `remote_url` after model initialisation.
    def model_post_init(self, __context: Any) -> None:
        """Set ``path_without_protocol`` from the URL path, without a leading slash.

        When ``remote_url`` ends in a file extension the parent directory of the
        URL path is used instead of the path itself.
        """
        super().model_post_init(__context)
        url_path = urlparse(self.remote_url).path
        extension = os.path.splitext(self.remote_url)[1]
        if extension:
            # The URL names a file: keep only its containing directory.
            url_path = Path(url_path).parent.as_posix()
        self.path_without_protocol = url_path.lstrip("/")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class SftpAccessConfig(FsspecAccessConfig):
    # Secret part of the SFTP connection settings; the password is required.
    password: str = Field(description="Password for sftp connection")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class SftpConnectionConfig(FsspecConnectionConfig):
    # Connection settings for an SFTP server.
    supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
    access_config: Secret[SftpAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    username: str = Field(description="Username for sftp connection")
    host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
    port: int = Field(default=22, description="Port for sftp connection")
    look_for_keys: bool = Field(
        default=False, description="Whether to search for private key files in ~/.ssh/"
    )
    allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")

    def get_access_config(self) -> dict[str, Any]:
        """Return the keyword arguments passed to the SFTP filesystem constructor."""
        access_config = {
            "username": self.username,
            "host": self.host,
            "port": self.port,
            "look_for_keys": self.look_for_keys,
            "allow_agent": self.allow_agent,
            "password": self.access_config.get_secret_value().password,
        }
        return access_config

    @contextmanager
    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def get_client(self, protocol: str) -> Generator["SFTPFileSystem", None, None]:
        """Yield a filesystem client for ``protocol`` and close its SSH client afterwards.

        Args:
            protocol: fsspec protocol name used to look up the filesystem class.

        Yields:
            The filesystem instance built from ``get_access_config()``.
        """
        # The paramiko.SSHClient() client that's opened by the SFTPFileSystem
        # never gets closed so explicitly adding that as part of this context manager
        from fsspec import get_filesystem_class

        client: SFTPFileSystem = get_filesystem_class(protocol)(
            **self.get_access_config(),
        )
        try:
            yield client
        finally:
            # Runs even when the caller's `with` body raises, so the SSH
            # connection is not leaked on errors.
            client.client.close()
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class SftpIndexer(FsspecIndexer):
    # Indexer for SFTP; host and port embedded in `remote_url` take precedence
    # over the values stored on the connection config.
    connection_config: SftpConnectionConfig
    index_config: SftpIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        """Copy host/port from ``remote_url`` onto the connection config when present."""
        url_parts = urlparse(self.index_config.remote_url)
        conn = self.connection_config
        conn.host = url_parts.hostname or conn.host
        conn.port = url_parts.port or conn.port

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield the parent indexer's files with identifiers prefixed by the server address."""
        for entry in super().run(**kwargs):
            conn = self.connection_config
            entry.identifier = f"sftp://{conn.host}:{conn.port}/{entry.identifier}"
            yield entry

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build source metadata for one listing entry.

        Args:
            file_data: Listing entry; must contain ``"name"``. ``"time"``,
                ``"mtime"`` and ``"size"`` are used when present.
        """
        path = file_data["name"]

        date_created = None
        if "time" in file_data:
            date_created = str(file_data["time"].timestamp())

        date_modified = None
        if "mtime" in file_data:
            date_modified = str(file_data["mtime"].timestamp())

        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            url=f"{self.index_config.protocol}://{path}",
            record_locator={
                "protocol": self.index_config.protocol,
                "remote_file_path": self.index_config.remote_url,
            },
            filesize_bytes=file_data.get("size"),
        )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class SftpDownloaderConfig(FsspecDownloaderConfig):
    # Downloader settings for SFTP; the remote URL is required here because the
    # downloader reads host and port from it.
    remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass
class SftpDownloader(FsspecDownloader):
    # Downloader bound to the "sftp" protocol; host and port embedded in the
    # download config's `remote_url` take precedence over the connection config.
    protocol: str = "sftp"
    connection_config: SftpConnectionConfig
    # Plain constant default, as in SftpIndexer/SftpUploader: this class is a
    # stdlib dataclass, so a pydantic `Field(...)` here would itself become the
    # default value instead of "sftp".
    connector_type: str = CONNECTOR_TYPE
    # NOTE(review): SftpDownloaderConfig declares `remote_url` without a default,
    # so this factory presumably only works when a config is passed explicitly.
    download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)

    def __post_init__(self):
        """Copy host/port from ``remote_url`` onto the connection config when present."""
        parsed_url = urlparse(self.download_config.remote_url)
        self.connection_config.host = parsed_url.hostname or self.connection_config.host
        self.connection_config.port = parsed_url.port or self.connection_config.port
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class SftpUploaderConfig(FsspecUploaderConfig):
    # Uploader settings for SFTP; adds nothing beyond the generic fsspec uploader config.
    pass
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@dataclass
class SftpUploader(FsspecUploader):
    # Uploader for SFTP; all behaviour comes from FsspecUploader.
    connector_type: str = CONNECTOR_TYPE
    connection_config: SftpConnectionConfig
    # NOTE(review): defaults to None although annotated as SftpUploaderConfig.
    upload_config: SftpUploaderConfig = field(default=None)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# Registry entries tying the SFTP classes together for the source and
# destination sides of a pipeline.
sftp_source_entry = SourceRegistryEntry(
    connection_config=SftpConnectionConfig,
    indexer_config=SftpIndexerConfig,
    indexer=SftpIndexer,
    downloader_config=SftpDownloaderConfig,
    downloader=SftpDownloader,
)

sftp_destination_entry = DestinationRegistryEntry(
    connection_config=SftpConnectionConfig,
    uploader_config=SftpUploaderConfig,
    uploader=SftpUploader,
)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Callable
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def json_serial(obj):
    """Convert a value that ``json`` cannot encode natively into a string.

    Paths become their POSIX form and datetimes their ISO-8601 form.

    Raises:
        TypeError: If ``obj`` is neither a ``Path`` nor a ``datetime``.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    if isinstance(obj, Path):
        return obj.as_posix()
    raise TypeError(f"Type {type(obj)} not serializable")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def sterilize_dict(data: dict, default: Callable = json_serial) -> dict:
    """Round-trip ``data`` through JSON so that it only holds JSON-native values.

    Args:
        data: The dictionary to convert.
        default: Fallback encoder handed to ``json.dumps`` for values it cannot
            encode itself.

    Returns:
        The result of decoding the JSON encoding of ``data``.
    """
    return json.loads(json.dumps(data, default=default))
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from pydantic import Field, Secret, model_validator
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.error import SourceConnectionError
|
|
12
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
+
from unstructured_ingest.v2.interfaces import (
|
|
14
|
+
AccessConfig,
|
|
15
|
+
ConnectionConfig,
|
|
16
|
+
Downloader,
|
|
17
|
+
DownloaderConfig,
|
|
18
|
+
DownloadResponse,
|
|
19
|
+
FileData,
|
|
20
|
+
FileDataSourceMetadata,
|
|
21
|
+
Indexer,
|
|
22
|
+
IndexerConfig,
|
|
23
|
+
SourceIdentifiers,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.v2.logger import logger
|
|
26
|
+
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
27
|
+
|
|
28
|
+
CONNECTOR_TYPE = "gitlab"
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from gitlab import Gitlab
|
|
31
|
+
from gitlab.v4.objects.projects import Project
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class GitLabAccessConfig(AccessConfig):
    # Secret part of the GitLab connection settings; the token is optional.
    access_token: Optional[str] = Field(
        description="Optional personal access token for authenticating with the GitLab API.",
        default=None,
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class GitLabConnectionConfig(ConnectionConfig):
    # Connection settings for a GitLab project. `base_url` and `repo_path` are
    # derived from `url` by the `set_repo_path` validator.
    access_config: Secret[GitLabAccessConfig] = Field(
        default_factory=GitLabAccessConfig,
        validate_default=True,
        description="Secret configuration for accessing the GitLab API by authentication token.",
    )
    url: str = Field(description="The full URL to the GitLab project or repository.")
    base_url: str = Field(
        default="https://gitlab.com",
        description="The base URL for the GitLab instance (default is GitLab's public domain).",
    )
    repo_path: str = Field(
        default=None,
        init=False,
        repr=False,
        description="The normalized path extracted from the repository URL.",
    )

    @model_validator(mode="after")
    def set_repo_path(self):
        """Derive ``base_url`` and ``repo_path`` from ``url``.

        ``base_url`` is overwritten with ``<scheme>://<netloc>`` only when the
        URL carries both parts; ``repo_path`` is always the URL path without
        leading slashes (e.g. ``owner/repo`` for
        ``https://gitlab.com/owner/repo``).
        """
        url_parts = urlparse(self.url)
        scheme, netloc = url_parts.scheme, url_parts.netloc

        if scheme and netloc:
            self.base_url = f"{scheme}://{netloc}"
        self.repo_path = url_parts.path.lstrip("/")

        return self

    @SourceConnectionError.wrap
    @requires_dependencies(["gitlab"], extras="gitlab")
    @contextmanager
    def get_client(self) -> Generator["Gitlab", None, None]:
        """Yield a ``Gitlab`` client for ``base_url``, using the access token if one is set."""
        from gitlab import Gitlab

        logger.info(f"Connection to GitLab: {self.base_url!r}")
        token = self.access_config.get_secret_value().access_token
        with Gitlab(self.base_url, private_token=token) as client:
            yield client

    @contextmanager
    def get_project(self) -> Generator["Project", None, None]:
        """Yield the GitLab project identified by ``repo_path``.

        Yields:
            Project: The project object returned by ``client.projects.get``.

        Raises:
            SourceConnectionError: If the GitLab API connection fails.
            gitlab.exceptions.GitlabGetError: If the project is not found.
        """
        with self.get_client() as client:
            logger.info(f"Accessing Project: '{self.repo_path}'")
            project = client.projects.get(self.repo_path)

            logger.info(f"Successfully accessed project '{self.repo_path}'")
            yield project
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class GitLabIndexerConfig(IndexerConfig):
    # Indexer settings for GitLab: which directory to walk, whether to recurse,
    # and which branch to read (None means the project's default branch is used
    # by the indexer).
    path: Path = Field(
        default="/",
        description="Path to the location in the repository that will be processed.",
    )
    recursive: bool = Field(
        default=True,
        description=(
            "Flag to control recursive operations when indexing. "
            "If True, the indexer will traverse directories recursively."
        ),
    )
    git_branch: Optional[str] = Field(
        default=None, description="The name of the branch to interact with."
    )
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@dataclass
class GitLabIndexer(Indexer):
    # Indexer that walks a GitLab repository tree and emits one FileData per blob.
    connection_config: GitLabConnectionConfig
    index_config: GitLabIndexerConfig

    def precheck(self) -> None:
        """Validate the connection to the GitLab instance.

        With an access token configured the client authenticates; without one
        the configured project is fetched instead.

        Raises:
            SourceConnectionError: If the connection or authentication with GitLab fails.
        """
        try:
            with self.connection_config.get_client() as client:
                if self.connection_config.access_config.get_secret_value().access_token is not None:
                    client.auth()
                else:
                    client.projects.get(self.connection_config.repo_path)

        except Exception as e:
            logger.error(f"Failed to validate connection: {e}", exc_info=True)
            # Chain the original exception so its traceback is preserved.
            raise SourceConnectionError(f"Failed to validate connection: {e}") from e

    def _relative_path(self, file_path: str) -> str:
        """Return ``file_path`` relative to the configured index path.

        ``relative_to`` raises ``ValueError`` when ``file_path`` does not lie
        under the configured path (for instance a relative entry path against
        an absolute root such as ``"/"``); in that case the entry path itself,
        without leading slashes, is used.
        """
        try:
            return str(Path(file_path).relative_to(self.index_config.path))
        except ValueError:
            return file_path.lstrip("/")

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Iterate over the repository tree and yield a ``FileData`` per file (blob).

        The tree is read from ``index_config.git_branch`` or, when unset, the
        project's default branch. Entries that are not blobs are skipped.

        Args:
            **kwargs (Any): Additional keyword arguments (unused).

        Yields:
            FileData: Path, identifiers, record locator and file mode of each blob.
        """
        with self.connection_config.get_project() as project:
            ref = self.index_config.git_branch or project.default_branch

            files = project.repository_tree(
                path=str(self.index_config.path),
                ref=ref,
                recursive=self.index_config.recursive,
                iterator=True,
                all=True,
            )

            for file in files:
                if file["type"] != "blob":
                    continue

                # The downloader needs both keys to fetch the file again.
                record_locator = {
                    "file_path": file["path"],
                    "ref": ref,
                }

                yield FileData(
                    identifier=file["id"],
                    connector_type=CONNECTOR_TYPE,
                    source_identifiers=SourceIdentifiers(
                        fullpath=file["path"],
                        filename=Path(file["path"]).name,
                        rel_path=self._relative_path(file["path"]),
                    ),
                    metadata=FileDataSourceMetadata(
                        url=file["id"],
                        record_locator=record_locator,
                        permissions_data=[{"mode": file["mode"]}],
                    ),
                    additional_metadata={},
                )
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
class GitLabDownloaderConfig(DownloaderConfig):
    # Downloader settings for GitLab; adds nothing beyond the generic downloader config.
    pass
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
@dataclass
class GitLabDownloader(Downloader):
    # Downloader that fetches a single repository file via the GitLab API.
    connection_config: GitLabConnectionConfig
    download_config: GitLabDownloaderConfig

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download a file from the repository and return a ``DownloadResponse``.

        Args:
            file_data (FileData): Metadata about the file to be downloaded.
            **kwargs (Any): Additional arguments (unused).

        Returns:
            DownloadResponse: A response object containing the download details.

        Raises:
            ValueError: If no download path can be generated, or the record
                locator is invalid.
        """
        download_path = self.get_download_path(file_data=file_data)
        if download_path is None:
            logger.error(
                "Generated download path is None, source_identifiers might be missing "
                "from FileData."
            )
            raise ValueError("Generated invalid download path.")

        self._download_file(file_data, download_path)
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def _download_file(self, file_data: FileData, download_path: Path) -> None:
        """Fetch the file named by the record locator and write it to ``download_path``.

        Raises:
            ValueError: If the record locator is missing or lacks ``ref`` or
                ``file_path``.
        """
        # NOTE: Indexer should supply the record locator in metadata
        if (
            file_data.metadata.record_locator is None
            or "ref" not in file_data.metadata.record_locator
            or "file_path" not in file_data.metadata.record_locator
        ):
            logger.error(
                f"Invalid record locator in metadata: {file_data.metadata.record_locator}. "
                "Keys 'ref' and 'file_path' must be present."
            )
            raise ValueError("Invalid record locator.")

        ref = file_data.metadata.record_locator["ref"]
        path = file_data.metadata.record_locator["file_path"]
        download_path.parent.mkdir(exist_ok=True, parents=True)

        with self.connection_config.get_project() as project:
            project_file = project.files.get(file_path=path, ref=ref)
            # The file is opened in binary mode, so the decoded content is
            # written as-is.
            with open(download_path, "wb") as file:
                file.write(project_file.decode())
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
# Registry entry tying the GitLab classes together as a source connector.
gitlab_source_entry = SourceRegistryEntry(
    indexer=GitLabIndexer,
    indexer_config=GitLabIndexerConfig,
    downloader=GitLabDownloader,
    downloader_config=GitLabDownloaderConfig,
    connection_config=GitLabConnectionConfig,
)
|