unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +1 -1
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import Type
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.connector.fsspec.fsspec import (
|
|
7
|
-
FsspecDestinationConnector,
|
|
8
|
-
FsspecIngestDoc,
|
|
9
|
-
FsspecSourceConnector,
|
|
10
|
-
FsspecWriteConfig,
|
|
11
|
-
SimpleFsspecConfig,
|
|
12
|
-
)
|
|
13
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
14
|
-
from unstructured_ingest.error import SourceConnectionError
|
|
15
|
-
from unstructured_ingest.interfaces import AccessConfig
|
|
16
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
-
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
@dataclass
class GcsAccessConfig(AccessConfig):
    """Access settings for the GCS connector.

    ``token`` is accepted in four forms: empty/None, one of the auth keywords
    listed in ``__post_init__``, a JSON string that ``json_to_dict`` turns into
    a dict, or a path to an existing file.
    """

    token: t.Optional[str] = enhanced_field(
        default=None, sensitive=True, overload_name="service_account_key"
    )

    def __post_init__(self):
        """Validate ``token``; a JSON string is replaced by its parsed dict.

        Raises:
            ValueError: if ``token`` is set but matches none of the accepted forms.
        """
        ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"

        # Case: null value
        if not self.token:
            return
        # Case: one of auth constants
        if self.token in ALLOWED_AUTH_VALUES:
            return
        # Case: token as json -- parse once and reuse the result instead of
        # calling json_to_dict a second time for the assignment.
        parsed_token = json_to_dict(self.token)
        if isinstance(parsed_token, dict):
            self.token = parsed_token
            return
        # Case: path to token
        if Path(self.token).is_file():
            return

        raise ValueError("Invalid auth token value")
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
@dataclass
class GcsWriteConfig(FsspecWriteConfig):
    """GCS write configuration; adds nothing to ``FsspecWriteConfig``."""
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
@dataclass
class SimpleGcsConfig(SimpleFsspecConfig):
    """Fsspec connector config whose access settings are a ``GcsAccessConfig``."""

    # Defaults to None, i.e. no access config supplied.
    access_config: GcsAccessConfig = None
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
@dataclass
class GcsIngestDoc(FsspecIngestDoc):
    """Fsspec ingest doc registered under the name "gcs"."""

    connector_config: SimpleGcsConfig
    registry_name: str = "gcs"

    @SourceConnectionError.wrap
    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def get_file(self):
        """Delegate to the inherited ``get_file`` behind the gcsfs/fsspec dependency guard."""
        super().get_file()
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
@dataclass
class GcsSourceConnector(FsspecSourceConnector):
    """Fsspec source connector that produces ``GcsIngestDoc`` instances."""

    connector_config: SimpleGcsConfig

    def __post_init__(self):
        """Select the ingest-doc class used by this connector."""
        self.ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def initialize(self):
        """Delegate to the inherited ``initialize`` behind the gcsfs/fsspec dependency guard."""
        super().initialize()
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
@dataclass
class GcsDestinationConnector(FsspecDestinationConnector):
    """Fsspec destination connector typed with the GCS connector and write configs."""

    connector_config: SimpleGcsConfig
    write_config: GcsWriteConfig
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from typing import Type
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.connector.fsspec.fsspec import (
|
|
6
|
-
FsspecDestinationConnector,
|
|
7
|
-
FsspecIngestDoc,
|
|
8
|
-
FsspecSourceConnector,
|
|
9
|
-
FsspecWriteConfig,
|
|
10
|
-
SimpleFsspecConfig,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
13
|
-
from unstructured_ingest.interfaces import AccessConfig
|
|
14
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
class S3AccessConfig(AccessConfig):
    """Access settings for the S3 connector.

    ``key``, ``secret`` and ``token`` are flagged as sensitive fields.
    """

    # Exposed under the overload name "anonymous"; off by default.
    anon: bool = enhanced_field(default=False, overload_name="anonymous")
    endpoint_url: t.Optional[str] = None
    key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
@dataclass
class S3WriteConfig(FsspecWriteConfig):
    """S3 write configuration; adds nothing to ``FsspecWriteConfig``."""
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
@dataclass
class SimpleS3Config(SimpleFsspecConfig):
    """Fsspec connector config whose access settings are an ``S3AccessConfig``."""

    # Defaults to None, i.e. no access config supplied.
    access_config: S3AccessConfig = enhanced_field(default=None)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
@dataclass
class S3IngestDoc(FsspecIngestDoc):
    """Fsspec ingest doc registered under the name "s3"."""

    connector_config: SimpleS3Config
    remote_file_path: str
    registry_name: str = "s3"

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    def get_file(self):
        """Delegate to the inherited ``get_file`` behind the s3fs/fsspec dependency guard."""
        super().get_file()
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
@dataclass
class S3SourceConnector(FsspecSourceConnector):
    """Fsspec source connector that produces ``S3IngestDoc`` instances."""

    connector_config: SimpleS3Config

    def __post_init__(self):
        """Select the ingest-doc class used by this connector."""
        self.ingest_doc_cls: Type[S3IngestDoc] = S3IngestDoc
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
@dataclass
class S3DestinationConnector(FsspecDestinationConnector):
    """Fsspec destination connector typed with the S3 connector and write configs."""

    connector_config: SimpleS3Config
    write_config: S3WriteConfig

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    def initialize(self):
        """Delegate to the inherited ``initialize`` behind the s3fs/fsspec dependency guard."""
        super().initialize()
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import Type
|
|
5
|
-
from urllib.parse import urlparse
|
|
6
|
-
|
|
7
|
-
from unstructured_ingest.connector.fsspec.fsspec import (
|
|
8
|
-
FsspecIngestDoc,
|
|
9
|
-
FsspecSourceConnector,
|
|
10
|
-
SimpleFsspecConfig,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
13
|
-
from unstructured_ingest.error import SourceConnectionError
|
|
14
|
-
from unstructured_ingest.interfaces import AccessConfig
|
|
15
|
-
from unstructured_ingest.logger import logger
|
|
16
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
class SftpAccessConfig(AccessConfig):
    """Access settings for the SFTP connector.

    ``host`` and ``port`` may be overwritten from the remote URL by
    ``SimpleSftpConfig.__post_init__``.
    """

    username: str
    password: str = enhanced_field(sensitive=True)
    host: str = ""
    port: int = 22
    # Both options are off by default.
    look_for_keys: bool = False
    allow_agent: bool = False
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@dataclass
class SimpleSftpConfig(SimpleFsspecConfig):
    """Fsspec connector config for SFTP; derives paths and host/port from ``remote_url``."""

    access_config: SftpAccessConfig = None

    def __post_init__(self):
        """Split ``remote_url`` into file/dir parts and copy host and port into the access config."""
        super().__post_init__()

        extension = os.path.splitext(self.remote_url)[1]
        url_parts = urlparse(self.remote_url)
        if extension:
            # A file name is only recorded when the URL ends in an extension.
            self.file_path = Path(self.remote_url).name
            self.dir_path = Path(url_parts.path).parent.as_posix().lstrip("/")
        else:
            self.file_path = ""
            self.dir_path = url_parts.path.lstrip("/")
        # Both branches use the directory path as the protocol-less path.
        self.path_without_protocol = self.dir_path

        # Values parsed from the URL win; otherwise the configured ones are kept.
        access = self.access_config
        access.host = url_parts.hostname or access.host
        access.port = url_parts.port or access.port
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
@dataclass
class SftpIngestDoc(FsspecIngestDoc):
    """Fsspec ingest doc registered under the name "sftp"."""

    connector_config: SimpleSftpConfig
    registry_name: str = "sftp"

    @SourceConnectionError.wrap
    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def get_file(self):
        """Delegate to the inherited ``get_file`` behind the paramiko/fsspec dependency guard."""
        super().get_file()
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
@dataclass
class SftpSourceConnector(FsspecSourceConnector):
    """Fsspec source connector that produces ``SftpIngestDoc`` instances."""

    connector_config: SimpleSftpConfig

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def initialize(self):
        """Delegate to the inherited ``initialize`` behind the paramiko/fsspec dependency guard."""
        super().initialize()

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def check_connection(self):
        """Validate the connection by constructing an ``SFTPFileSystem``.

        Raises:
            SourceConnectionError: if constructing the filesystem fails; the
                original exception is attached as the cause.
        """
        from fsspec.implementations.sftp import SFTPFileSystem

        try:
            SFTPFileSystem(**self.connector_config.get_access_config())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the underlying failure is kept as __cause__.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        """Select the ingest-doc class used by this connector."""
        self.ingest_doc_cls: Type[SftpIngestDoc] = SftpIngestDoc
|
|
@@ -1,124 +0,0 @@
|
|
|
1
|
-
import fnmatch
|
|
2
|
-
import typing as t
|
|
3
|
-
from dataclasses import dataclass, field
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
7
|
-
from unstructured_ingest.error import SourceConnectionError
|
|
8
|
-
from unstructured_ingest.interfaces import (
|
|
9
|
-
AccessConfig,
|
|
10
|
-
BaseConnectorConfig,
|
|
11
|
-
BaseSingleIngestDoc,
|
|
12
|
-
BaseSourceConnector,
|
|
13
|
-
IngestDocCleanupMixin,
|
|
14
|
-
SourceConnectorCleanupMixin,
|
|
15
|
-
)
|
|
16
|
-
from unstructured_ingest.logger import logger
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
class GitAccessConfig(AccessConfig):
    """Access settings shared by the git-based connectors."""

    # Optional, flagged sensitive; exposed under the overload name "git_access_token".
    access_token: t.Optional[str] = enhanced_field(
        default=None,
        sensitive=True,
        overload_name="git_access_token",
    )
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
@dataclass
class SimpleGitConfig(BaseConnectorConfig):
    """Base connector config for git-hosted repositories."""

    url: str
    access_config: GitAccessConfig
    branch: t.Optional[str] = enhanced_field(default=None, overload_name="git_branch")
    file_glob: t.Optional[t.List[str]] = enhanced_field(default=None, overload_name="git_file_glob")
    # Not an __init__ argument and hidden from repr; no default is given here.
    repo_path: str = field(init=False, repr=False)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@dataclass
class GitIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Base ingest doc for a single file inside a git repository.

    ``update_source_metadata``, ``_fetch_content`` and ``_fetch_and_write``
    raise ``NotImplementedError`` here.
    """

    connector_config: SimpleGitConfig = field(repr=False)
    path: str

    @property
    def filename(self):
        """Resolved local download path: the download dir joined with ``path``."""
        download_dir = Path(self.read_config.download_dir)
        return (download_dir / self.path).resolve()

    @property
    def _output_filename(self):
        """Output path: ``<output_dir>/<path>.json``."""
        output_dir = Path(self.processor_config.output_dir)
        return output_dir / f"{self.path}.json"

    @property
    def record_locator(self) -> t.Dict[str, t.Any]:
        """Repo path and file path, plus the branch when one is configured."""
        locator = {
            "repo_path": self.connector_config.repo_path,
            "file_path": self.path,
        }
        branch = self.connector_config.branch
        if branch is not None:
            locator["branch"] = branch
        return locator

    def _create_full_tmp_dir_path(self):
        """Create the local parent directories of ``filename``, mirroring the repository layout."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)

    def update_source_metadata(self, **kwargs):
        """Not implemented in the base class."""
        raise NotImplementedError()

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetch the "remote" doc and store it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        self._fetch_and_write()

    def _fetch_content(self) -> None:
        """Not implemented in the base class."""
        raise NotImplementedError()

    def _fetch_and_write(self) -> None:
        """Not implemented in the base class."""
        raise NotImplementedError()
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
@dataclass
class GitSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Base source connector for git repositories: file-type and glob filtering helpers."""

    connector_config: SimpleGitConfig

    def initialize(self):
        """No-op in the base class."""
        pass

    def check_connection(self):
        """No-op in the base class."""
        pass

    @staticmethod
    def is_file_type_supported(path: str) -> bool:
        """Return True when ``path`` ends with one of the supported extensions.

        Unsupported paths are reported at debug level.
        """
        # Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files
        # TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc.
        supported_suffixes = (
            ".md",
            ".txt",
            ".pdf",
            ".doc",
            ".docx",
            ".eml",
            ".heic",
            ".html",
            ".png",
            ".jpg",
            ".ppt",
            ".pptx",
            ".xml",
        )
        if path.endswith(supported_suffixes):
            return True
        logger.debug(
            f"The file {path!r} is discarded as it does not contain a supported filetype.",
        )
        return False

    def does_path_match_glob(self, path: str) -> bool:
        """Return True when no globs are configured or ``path`` matches at least one of them."""
        globs = self.connector_config.file_glob
        if not globs:
            return True
        if any(fnmatch.fnmatch(path, glob) for glob in globs):
            return True
        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
        return False
|
|
@@ -1,174 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from urllib.parse import urlparse
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.connector.git import (
|
|
7
|
-
GitIngestDoc,
|
|
8
|
-
GitSourceConnector,
|
|
9
|
-
SimpleGitConfig,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
12
|
-
from unstructured_ingest.interfaces import SourceMetadata
|
|
13
|
-
from unstructured_ingest.logger import logger
|
|
14
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
15
|
-
|
|
16
|
-
if t.TYPE_CHECKING:
|
|
17
|
-
from github.Repository import Repository
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
@dataclass
class SimpleGitHubConfig(SimpleGitConfig):
    """Git connector config for GitHub; validates ``url`` and derives ``repo_path``."""

    def __post_init__(self):
        """Check that ``url`` is a GitHub URL or an owner/name pair, then store its path.

        Raises:
            ValueError: if a scheme other than https or a host other than
                github.com is given, or the path does not have exactly two parts.
        """
        parsed = urlparse(self.url)
        path_parts = [part for part in parsed.path.split("/") if part]

        # Scheme and host are optional, but must be the expected ones when given.
        wrong_scheme = parsed.scheme not in ("", "https")
        wrong_host = parsed.netloc not in ("", "github.com")
        if wrong_scheme or wrong_host or len(path_parts) != 2:
            raise ValueError(
                'Please provide a valid URL, e.g. "https://github.com/Unstructured-IO/unstructured"'
                ' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured".',
            )

        # Validation passed: keep the path as the repository identifier.
        self.repo_path = parsed.path

    @SourceConnectionError.wrap
    @requires_dependencies(["github"], extras="github")
    def get_repo(self) -> "Repository":
        """Return the repository at ``repo_path`` from a client built with the access token."""
        from github import Github

        client = Github(self.access_config.access_token)
        return client.get_repo(self.repo_path)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@dataclass
class GitHubIngestDoc(GitIngestDoc):
    """Ingest doc for a single file in a GitHub repository, registered as "github"."""

    connector_config: SimpleGitHubConfig
    registry_name: str = "github"

    @property
    def date_created(self) -> t.Optional[str]:
        """Always None for this connector."""
        return None

    @requires_dependencies(["github"], extras="github")
    def _fetch_file(self):
        """Return the repository content object for ``path``, or None if GitHub does not know it."""
        from github.GithubException import UnknownObjectException

        try:
            content_file = self.connector_config.get_repo().get_contents(self.path)
        except UnknownObjectException:
            logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}")
            return None

        return content_file

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["requests"], extras="github")
    def _fetch_content(self, content_file):
        """Return the raw bytes of ``content_file``, or None when they cannot be obtained.

        None is returned when ``content_file`` is None (file lookup failed) or
        when the direct-download fallback does not answer with HTTP 200.
        """
        import requests

        # A missing file has no content to read; let the caller report it
        # instead of failing on an attribute access on None.
        if content_file is None:
            return None

        contents = b""
        if (
            not content_file.content  # type: ignore
            and content_file.encoding == "none"  # type: ignore
            and content_file.size  # type: ignore
        ):
            logger.info("File too large for the GitHub API, using direct download link instead.")
            # NOTE: Maybe add a raise_for_status to catch connection timeout or HTTP Errors?
            response = requests.get(content_file.download_url)  # type: ignore
            if response.status_code != 200:
                logger.info("Direct download link has failed... Skipping this file.")
                return None
            else:
                contents = response.content
        else:
            contents = content_file.decoded_content  # type: ignore
        return contents

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the file's content object.

        Args:
            **kwargs: may carry ``content_file`` (possibly None) that was
                already fetched; the file is only fetched here when the key
                is absent.
        """
        # Look the key up explicitly: a dict.get() default would be evaluated
        # eagerly and trigger a second API request even when the caller
        # already supplied the content object.
        if "content_file" in kwargs:
            content_file = kwargs["content_file"]
        else:
            content_file = self._fetch_file()
        if content_file is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        date_modified = datetime.strptime(
            content_file.last_modified,
            "%a, %d %b %Y %H:%M:%S %Z",
        ).isoformat()
        self.source_metadata = SourceMetadata(
            date_modified=date_modified,
            version=content_file.etag,
            source_url=content_file.download_url,
            exists=True,
        )

    def _fetch_and_write(self) -> None:
        """Fetch the file, record its metadata and write its bytes to ``filename``.

        Raises:
            ValueError: if the file content could not be retrieved.
        """
        content_file = self._fetch_file()
        self.update_source_metadata(content_file=content_file)
        contents = self._fetch_content(content_file)
        if contents is None:
            raise ValueError(
                f"Failed to retrieve file from repo "
                f"{self.connector_config.url}/{self.path}. Check logs",
            )
        with open(self.filename, "wb") as f:
            f.write(contents)
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
@dataclass
class GitHubSourceConnector(GitSourceConnector):
    """Source connector that lists the files of a GitHub repository as ingest docs."""

    connector_config: SimpleGitHubConfig

    @requires_dependencies(["github"], extras="github")
    def check_connection(self):
        """Validate access by issuing a HEAD request for the repository.

        Raises:
            SourceConnectionError: if the request (or building the requester)
                fails; the original exception is attached as the cause.
        """
        from github import Consts
        from github.GithubRetry import GithubRetry
        from github.Requester import Requester

        try:
            requester = Requester(
                auth=self.connector_config.access_config.access_token,
                base_url=Consts.DEFAULT_BASE_URL,
                timeout=Consts.DEFAULT_TIMEOUT,
                user_agent=Consts.DEFAULT_USER_AGENT,
                per_page=Consts.DEFAULT_PER_PAGE,
                verify=True,
                retry=GithubRetry(),
                pool_size=None,
            )
            # Integer repo identifiers use a different URL prefix than owner/name paths.
            url_base = (
                "/repositories/" if isinstance(self.connector_config.repo_path, int) else "/repos/"
            )
            url = f"{url_base}{self.connector_config.repo_path}"
            headers, _ = requester.requestJsonAndCheck("HEAD", url)
            logger.debug(f"headers from HEAD request: {headers}")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the underlying failure is kept as __cause__.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one ``GitHubIngestDoc`` per supported, glob-matching blob in the git tree."""
        repo = self.connector_config.get_repo()
        # Load the Git tree with all files, and then create Ingest docs
        # for all blobs, i.e. all files, ignoring directories
        sha = self.connector_config.branch or repo.default_branch
        git_tree = repo.get_git_tree(sha, recursive=True)
        return [
            GitHubIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                path=element.path,
            )
            for element in git_tree.tree
            if element.type == "blob"
            and self.is_file_type_supported(element.path)
            and (not self.connector_config.file_glob or self.does_path_match_glob(element.path))
        ]
|
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from urllib.parse import urlparse
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.connector.git import (
|
|
6
|
-
GitIngestDoc,
|
|
7
|
-
GitSourceConnector,
|
|
8
|
-
SimpleGitConfig,
|
|
9
|
-
)
|
|
10
|
-
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
11
|
-
from unstructured_ingest.interfaces import SourceMetadata
|
|
12
|
-
from unstructured_ingest.logger import logger
|
|
13
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
-
|
|
15
|
-
if t.TYPE_CHECKING:
|
|
16
|
-
from gitlab.v4.objects.projects import Project
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
class SimpleGitlabConfig(SimpleGitConfig):
    """Git connector config for GitLab; derives ``base_url`` and ``repo_path`` from ``url``."""

    base_url: str = "https://gitlab.com"

    def __post_init__(self):
        """Split ``url`` into the instance base URL and the project path."""
        parsed = urlparse(self.url)
        # A scheme or host in the URL replaces the default base URL.
        if parsed.scheme or parsed.netloc:
            self.base_url = f"{parsed.scheme}://{parsed.netloc}"
        # The project path is stored without any leading slashes.
        self.repo_path = parsed.path.lstrip("/")

    @SourceConnectionError.wrap
    @requires_dependencies(["gitlab"], extras="gitlab")
    def get_project(self) -> "Project":
        """Return the project at ``repo_path`` from a client built with the access token."""
        from gitlab import Gitlab

        client = Gitlab(self.base_url, private_token=self.access_config.access_token)
        return client.projects.get(self.repo_path)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@dataclass
class GitLabIngestDoc(GitIngestDoc):
    """Ingest doc for a single file in a GitLab project, registered as "gitlab"."""

    connector_config: SimpleGitlabConfig
    registry_name: str = "gitlab"

    @property
    def date_created(self) -> t.Optional[str]:
        """Always None for this connector."""
        return None

    @property
    def date_modified(self) -> t.Optional[str]:
        """Always None for this connector."""
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        """Always None for this connector."""
        return None

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["gitlab"], extras="gitlab")
    def _fetch_content(self):
        """Return the project file object for ``path``, or None on an HTTP 404.

        Any other ``GitlabHttpError`` is re-raised.
        """
        from gitlab.exceptions import GitlabHttpError

        try:
            project = self.connector_config.get_project()
            content_file = project.files.get(
                self.path,
                ref=self.connector_config.branch or project.default_branch,
            )
        except GitlabHttpError as e:
            if e.response_code == 404:
                logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}")
                return None
            raise
        return content_file

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the project file object.

        Args:
            **kwargs: may carry ``content_file`` (possibly None) that was
                already fetched; the file is only fetched here when the key
                is absent.
        """
        # Look the key up explicitly: a dict.get() default would be evaluated
        # eagerly and trigger a second API request even when the caller
        # already supplied the file object.
        if "content_file" in kwargs:
            content_file = kwargs["content_file"]
        else:
            content_file = self._fetch_content()
        if content_file is None:
            # NOTE(review): a missing file is recorded as exists=None rather
            # than False -- confirm this is intended.
            self.source_metadata = SourceMetadata(
                exists=None,
            )
            return
        self.source_metadata = SourceMetadata(
            version=content_file.attributes.get("last_commit_id", ""),
            exists=True,
        )

    def _fetch_and_write(self) -> None:
        """Fetch the file, record its metadata and write its decoded content to ``filename``.

        Raises:
            ValueError: if the file could not be retrieved.
        """
        content_file = self._fetch_content()
        self.update_source_metadata(content_file=content_file)
        if content_file is None:
            raise ValueError(
                f"Failed to retrieve file from repo "
                f"{self.connector_config.url}/{self.path}. Check logs.",
            )
        contents = content_file.decode()
        with open(self.filename, "wb") as f:
            f.write(contents)
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
@dataclass
class GitLabSourceConnector(GitSourceConnector):
    """Source connector that lists the files of a GitLab project as ingest docs."""

    connector_config: SimpleGitlabConfig

    @requires_dependencies(["gitlab"], extras="gitlab")
    def check_connection(self):
        """Validate access by authenticating against the configured GitLab instance.

        Raises:
            SourceConnectionError: if a ``GitlabError`` is raised; the original
                exception is attached as the cause.
        """
        from gitlab import Gitlab
        from gitlab.exceptions import GitlabError

        try:
            gitlab = Gitlab(
                self.connector_config.base_url,
                private_token=self.connector_config.access_config.access_token,
            )
            gitlab.auth()
        except GitlabError as gitlab_error:
            logger.error(f"failed to validate connection: {gitlab_error}", exc_info=True)
            # Chain explicitly so the underlying failure is kept as __cause__.
            raise SourceConnectionError(
                f"failed to validate connection: {gitlab_error}"
            ) from gitlab_error

    def get_ingest_docs(self):
        """Return one ``GitLabIngestDoc`` per supported, glob-matching blob in the repository tree."""
        # Load the Git tree with all files, and then create Ingest docs
        # for all blobs, i.e. all files, ignoring directories
        project = self.connector_config.get_project()
        ref = self.connector_config.branch or project.default_branch
        git_tree = project.repository_tree(
            ref=ref,
            recursive=True,
            iterator=True,
            all=True,
        )
        return [
            GitLabIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                path=element["path"],
            )
            for element in git_tree
            if element["type"] == "blob"
            and self.is_file_type_supported(element["path"])
            and (not self.connector_config.file_glob or self.does_path_match_glob(element["path"]))
        ]
|