unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +49 -0
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/processes/connectors/github.py +221 -0
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -8,20 +8,20 @@ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
|
|
|
8
8
|
|
|
9
9
|
from pydantic import BaseModel, Field, Secret, SecretStr, field_validator
|
|
10
10
|
|
|
11
|
+
from unstructured_ingest.data_types.file_data import (
|
|
12
|
+
BatchFileData,
|
|
13
|
+
BatchItem,
|
|
14
|
+
FileData,
|
|
15
|
+
FileDataSourceMetadata,
|
|
16
|
+
SourceIdentifiers,
|
|
17
|
+
)
|
|
11
18
|
from unstructured_ingest.error import (
|
|
12
19
|
DestinationConnectionError,
|
|
13
20
|
SourceConnectionError,
|
|
14
21
|
SourceConnectionNetworkError,
|
|
15
22
|
WriteError,
|
|
16
23
|
)
|
|
17
|
-
from unstructured_ingest.
|
|
18
|
-
batch_generator,
|
|
19
|
-
flatten_dict,
|
|
20
|
-
generator_batching_wbytes,
|
|
21
|
-
)
|
|
22
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
23
|
-
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
24
|
-
from unstructured_ingest.v2.interfaces import (
|
|
24
|
+
from unstructured_ingest.interfaces import (
|
|
25
25
|
AccessConfig,
|
|
26
26
|
ConnectionConfig,
|
|
27
27
|
Downloader,
|
|
@@ -35,19 +35,19 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
35
35
|
UploadStagerConfig,
|
|
36
36
|
download_responses,
|
|
37
37
|
)
|
|
38
|
-
from unstructured_ingest.
|
|
39
|
-
from unstructured_ingest.
|
|
38
|
+
from unstructured_ingest.logger import logger
|
|
39
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
40
40
|
DestinationRegistryEntry,
|
|
41
41
|
SourceRegistryEntry,
|
|
42
42
|
)
|
|
43
|
-
from unstructured_ingest.
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
43
|
+
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
|
|
44
|
+
from unstructured_ingest.utils.data_prep import (
|
|
45
|
+
batch_generator,
|
|
46
|
+
flatten_dict,
|
|
47
|
+
generator_batching_wbytes,
|
|
48
|
+
get_enhanced_element_id,
|
|
49
49
|
)
|
|
50
|
-
from unstructured_ingest.
|
|
50
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
51
51
|
|
|
52
52
|
if TYPE_CHECKING:
|
|
53
53
|
from elasticsearch import Elasticsearch as ElasticsearchClient
|
|
@@ -7,17 +7,16 @@ from pydantic import BaseModel, Field, Secret, field_validator
|
|
|
7
7
|
from unstructured_ingest.error import (
|
|
8
8
|
DestinationConnectionError,
|
|
9
9
|
)
|
|
10
|
-
from unstructured_ingest.
|
|
11
|
-
from unstructured_ingest.v2.interfaces import (
|
|
10
|
+
from unstructured_ingest.interfaces import (
|
|
12
11
|
AccessConfig,
|
|
13
12
|
ConnectionConfig,
|
|
14
13
|
)
|
|
15
|
-
from unstructured_ingest.
|
|
16
|
-
from unstructured_ingest.
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
17
16
|
DestinationRegistryEntry,
|
|
18
17
|
SourceRegistryEntry,
|
|
19
18
|
)
|
|
20
|
-
from unstructured_ingest.
|
|
19
|
+
from unstructured_ingest.processes.connectors.elasticsearch.elasticsearch import (
|
|
21
20
|
ElasticsearchDownloader,
|
|
22
21
|
ElasticsearchDownloaderConfig,
|
|
23
22
|
ElasticsearchIndexer,
|
|
@@ -27,6 +26,7 @@ from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch imp
|
|
|
27
26
|
ElasticsearchUploadStager,
|
|
28
27
|
ElasticsearchUploadStagerConfig,
|
|
29
28
|
)
|
|
29
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
30
30
|
|
|
31
31
|
if TYPE_CHECKING:
|
|
32
32
|
from opensearchpy import OpenSearch
|
|
@@ -7,14 +7,14 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
|
7
7
|
|
|
8
8
|
from pydantic import Field, Secret
|
|
9
9
|
|
|
10
|
-
from unstructured_ingest.
|
|
11
|
-
from unstructured_ingest.
|
|
12
|
-
from unstructured_ingest.
|
|
13
|
-
from unstructured_ingest.
|
|
10
|
+
from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
|
|
11
|
+
from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError
|
|
12
|
+
from unstructured_ingest.logger import logger
|
|
13
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
14
14
|
DestinationRegistryEntry,
|
|
15
15
|
SourceRegistryEntry,
|
|
16
16
|
)
|
|
17
|
-
from unstructured_ingest.
|
|
17
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
18
18
|
FsspecAccessConfig,
|
|
19
19
|
FsspecConnectionConfig,
|
|
20
20
|
FsspecDownloader,
|
|
@@ -24,12 +24,12 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
24
24
|
FsspecUploader,
|
|
25
25
|
FsspecUploaderConfig,
|
|
26
26
|
)
|
|
27
|
-
from unstructured_ingest.
|
|
28
|
-
from unstructured_ingest.
|
|
27
|
+
from unstructured_ingest.processes.connectors.fsspec.utils import json_serial, sterilize_dict
|
|
28
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
29
29
|
BlobStoreUploadStager,
|
|
30
30
|
BlobStoreUploadStagerConfig,
|
|
31
31
|
)
|
|
32
|
-
from unstructured_ingest.
|
|
32
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
33
33
|
|
|
34
34
|
if TYPE_CHECKING:
|
|
35
35
|
from adlfs import AzureBlobFileSystem
|
|
@@ -9,14 +9,14 @@ from dateutil import parser
|
|
|
9
9
|
from pydantic import Field, Secret
|
|
10
10
|
from pydantic.functional_validators import BeforeValidator
|
|
11
11
|
|
|
12
|
-
from unstructured_ingest.
|
|
13
|
-
from unstructured_ingest.
|
|
14
|
-
from unstructured_ingest.
|
|
15
|
-
from unstructured_ingest.
|
|
12
|
+
from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
|
|
13
|
+
from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
16
16
|
DestinationRegistryEntry,
|
|
17
17
|
SourceRegistryEntry,
|
|
18
18
|
)
|
|
19
|
-
from unstructured_ingest.
|
|
19
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
20
20
|
FsspecAccessConfig,
|
|
21
21
|
FsspecConnectionConfig,
|
|
22
22
|
FsspecDownloader,
|
|
@@ -26,12 +26,12 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
26
26
|
FsspecUploader,
|
|
27
27
|
FsspecUploaderConfig,
|
|
28
28
|
)
|
|
29
|
-
from unstructured_ingest.
|
|
30
|
-
from unstructured_ingest.
|
|
29
|
+
from unstructured_ingest.processes.connectors.utils import conform_string_to_dict
|
|
30
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
31
31
|
BlobStoreUploadStager,
|
|
32
32
|
BlobStoreUploadStagerConfig,
|
|
33
33
|
)
|
|
34
|
-
from unstructured_ingest.
|
|
34
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
35
35
|
|
|
36
36
|
if TYPE_CHECKING:
|
|
37
37
|
from boxfs import BoxFileSystem
|
|
@@ -6,21 +6,21 @@ from typing import TYPE_CHECKING, Any, Optional
|
|
|
6
6
|
|
|
7
7
|
from pydantic import Field, Secret
|
|
8
8
|
|
|
9
|
-
from unstructured_ingest.
|
|
10
|
-
from unstructured_ingest.
|
|
9
|
+
from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
|
|
10
|
+
from unstructured_ingest.errors_v2 import (
|
|
11
11
|
ProviderError,
|
|
12
12
|
UserAuthError,
|
|
13
13
|
UserError,
|
|
14
14
|
)
|
|
15
|
-
from unstructured_ingest.
|
|
15
|
+
from unstructured_ingest.errors_v2 import (
|
|
16
16
|
RateLimitError as CustomRateLimitError,
|
|
17
17
|
)
|
|
18
|
-
from unstructured_ingest.
|
|
19
|
-
from unstructured_ingest.
|
|
18
|
+
from unstructured_ingest.logger import logger
|
|
19
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
20
20
|
DestinationRegistryEntry,
|
|
21
21
|
SourceRegistryEntry,
|
|
22
22
|
)
|
|
23
|
-
from unstructured_ingest.
|
|
23
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
24
24
|
FsspecAccessConfig,
|
|
25
25
|
FsspecConnectionConfig,
|
|
26
26
|
FsspecDownloader,
|
|
@@ -30,11 +30,11 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
30
30
|
FsspecUploader,
|
|
31
31
|
FsspecUploaderConfig,
|
|
32
32
|
)
|
|
33
|
-
from unstructured_ingest.
|
|
33
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
34
34
|
BlobStoreUploadStager,
|
|
35
35
|
BlobStoreUploadStagerConfig,
|
|
36
36
|
)
|
|
37
|
-
from unstructured_ingest.
|
|
37
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
38
38
|
|
|
39
39
|
if TYPE_CHECKING:
|
|
40
40
|
pass
|
|
@@ -12,7 +12,12 @@ from uuid import NAMESPACE_DNS, uuid5
|
|
|
12
12
|
|
|
13
13
|
from pydantic import BaseModel, Field, Secret
|
|
14
14
|
|
|
15
|
-
from unstructured_ingest.
|
|
15
|
+
from unstructured_ingest.data_types.file_data import (
|
|
16
|
+
FileData,
|
|
17
|
+
FileDataSourceMetadata,
|
|
18
|
+
SourceIdentifiers,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.interfaces import (
|
|
16
21
|
AccessConfig,
|
|
17
22
|
ConnectionConfig,
|
|
18
23
|
Downloader,
|
|
@@ -23,13 +28,8 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
23
28
|
Uploader,
|
|
24
29
|
UploaderConfig,
|
|
25
30
|
)
|
|
26
|
-
from unstructured_ingest.
|
|
27
|
-
from unstructured_ingest.
|
|
28
|
-
from unstructured_ingest.v2.types.file_data import (
|
|
29
|
-
FileData,
|
|
30
|
-
FileDataSourceMetadata,
|
|
31
|
-
SourceIdentifiers,
|
|
32
|
-
)
|
|
31
|
+
from unstructured_ingest.logger import logger
|
|
32
|
+
from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
|
|
33
33
|
|
|
34
34
|
if TYPE_CHECKING:
|
|
35
35
|
from fsspec import AbstractFileSystem
|
|
@@ -9,15 +9,14 @@ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
|
|
|
9
9
|
from dateutil import parser
|
|
10
10
|
from pydantic import Field, Secret
|
|
11
11
|
|
|
12
|
-
from unstructured_ingest.
|
|
13
|
-
from unstructured_ingest.
|
|
14
|
-
from unstructured_ingest.
|
|
15
|
-
from unstructured_ingest.
|
|
16
|
-
from unstructured_ingest.v2.processes.connector_registry import (
|
|
12
|
+
from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
|
|
13
|
+
from unstructured_ingest.errors_v2 import ProviderError, UserError
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
17
16
|
DestinationRegistryEntry,
|
|
18
17
|
SourceRegistryEntry,
|
|
19
18
|
)
|
|
20
|
-
from unstructured_ingest.
|
|
19
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
21
20
|
FsspecAccessConfig,
|
|
22
21
|
FsspecConnectionConfig,
|
|
23
22
|
FsspecDownloader,
|
|
@@ -27,11 +26,12 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
27
26
|
FsspecUploader,
|
|
28
27
|
FsspecUploaderConfig,
|
|
29
28
|
)
|
|
30
|
-
from unstructured_ingest.
|
|
29
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
31
30
|
BlobStoreUploadStager,
|
|
32
31
|
BlobStoreUploadStagerConfig,
|
|
33
32
|
)
|
|
34
|
-
from unstructured_ingest.
|
|
33
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
34
|
+
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
35
35
|
|
|
36
36
|
if TYPE_CHECKING:
|
|
37
37
|
from gcsfs import GCSFileSystem
|
|
@@ -6,14 +6,16 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
|
6
6
|
|
|
7
7
|
from pydantic import Field, Secret
|
|
8
8
|
|
|
9
|
-
from unstructured_ingest.
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
from unstructured_ingest.
|
|
9
|
+
from unstructured_ingest.data_types.file_data import (
|
|
10
|
+
FileDataSourceMetadata,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError
|
|
13
|
+
from unstructured_ingest.logger import logger
|
|
14
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
13
15
|
DestinationRegistryEntry,
|
|
14
16
|
SourceRegistryEntry,
|
|
15
17
|
)
|
|
16
|
-
from unstructured_ingest.
|
|
18
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
17
19
|
FsspecAccessConfig,
|
|
18
20
|
FsspecConnectionConfig,
|
|
19
21
|
FsspecDownloader,
|
|
@@ -23,13 +25,11 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
23
25
|
FsspecUploader,
|
|
24
26
|
FsspecUploaderConfig,
|
|
25
27
|
)
|
|
26
|
-
from unstructured_ingest.
|
|
28
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
27
29
|
BlobStoreUploadStager,
|
|
28
30
|
BlobStoreUploadStagerConfig,
|
|
29
31
|
)
|
|
30
|
-
from unstructured_ingest.
|
|
31
|
-
FileDataSourceMetadata,
|
|
32
|
-
)
|
|
32
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
33
33
|
|
|
34
34
|
CONNECTOR_TYPE = "s3"
|
|
35
35
|
|
|
@@ -10,12 +10,12 @@ from urllib.parse import urlparse
|
|
|
10
10
|
|
|
11
11
|
from pydantic import Field, Secret
|
|
12
12
|
|
|
13
|
-
from unstructured_ingest.
|
|
14
|
-
from unstructured_ingest.
|
|
13
|
+
from unstructured_ingest.data_types.file_data import FileData, FileDataSourceMetadata
|
|
14
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
15
15
|
DestinationRegistryEntry,
|
|
16
16
|
SourceRegistryEntry,
|
|
17
17
|
)
|
|
18
|
-
from unstructured_ingest.
|
|
18
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
19
19
|
FsspecAccessConfig,
|
|
20
20
|
FsspecConnectionConfig,
|
|
21
21
|
FsspecDownloader,
|
|
@@ -25,11 +25,11 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
25
25
|
FsspecUploader,
|
|
26
26
|
FsspecUploaderConfig,
|
|
27
27
|
)
|
|
28
|
-
from unstructured_ingest.
|
|
28
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
29
29
|
BlobStoreUploadStager,
|
|
30
30
|
BlobStoreUploadStagerConfig,
|
|
31
31
|
)
|
|
32
|
-
from unstructured_ingest.
|
|
32
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
33
33
|
|
|
34
34
|
if TYPE_CHECKING:
|
|
35
35
|
from fsspec.implementations.sftp import SFTPFileSystem
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from time import time
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
from uuid import NAMESPACE_DNS, uuid5
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, Secret, field_validator
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.data_types.file_data import (
|
|
11
|
+
FileData,
|
|
12
|
+
FileDataSourceMetadata,
|
|
13
|
+
SourceIdentifiers,
|
|
14
|
+
)
|
|
15
|
+
from unstructured_ingest.errors_v2 import ProviderError, UserAuthError, UserError
|
|
16
|
+
from unstructured_ingest.interfaces import (
|
|
17
|
+
AccessConfig,
|
|
18
|
+
ConnectionConfig,
|
|
19
|
+
Downloader,
|
|
20
|
+
DownloaderConfig,
|
|
21
|
+
Indexer,
|
|
22
|
+
IndexerConfig,
|
|
23
|
+
download_responses,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.logger import logger
|
|
26
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
27
|
+
SourceRegistryEntry,
|
|
28
|
+
)
|
|
29
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
30
|
+
|
|
31
|
+
if TYPE_CHECKING:
|
|
32
|
+
from github import ContentFile, GitTreeElement, Repository
|
|
33
|
+
from github import Github as GithubClient
|
|
34
|
+
from github.GithubException import GithubException
|
|
35
|
+
from requests import HTTPError
|
|
36
|
+
|
|
37
|
+
# Connector type label; used as the default ``connector_type`` of the GitHub
# indexer and downloader in this module.
CONNECTOR_TYPE = "github"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class GithubAccessConfig(AccessConfig):
    # Sensitive settings for the GitHub connector; the connection config wraps
    # this model in ``Secret[...]``.

    # Token handed to the GitHub client as ``login_or_token``.
    access_token: str = Field(description="Github access token")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class GithubConnectionConfig(ConnectionConfig):
    # Connection settings for one GitHub repository, plus helpers that build a
    # client and translate GitHub/HTTP failures into the project's error types.

    access_config: Secret[GithubAccessConfig]
    url: str = Field(description="Github url or repository owner/name pair")

    @field_validator("url", mode="after")
    @classmethod
    def conform_url(cls, value: str) -> str:
        """Reduce ``value`` to its URL path with surrounding slashes removed.

        ``urlparse("https://github.com/owner/name").path`` is ``"/owner/name"``;
        stripping the slashes makes a full URL and a bare ``owner/name`` pair
        normalise to the same string, so ``get_full_url`` does not emit a
        double slash and ``get_repo`` receives a plain ``owner/name``.
        """
        return urlparse(value).path.strip("/")

    def get_full_url(self) -> str:
        """Return the repository address under ``https://github.com/``."""
        return f"https://github.com/{self.url}"

    @requires_dependencies(["github"], extras="github")
    def get_client(self) -> "GithubClient":
        """Create a new GitHub client authenticated with the configured token."""
        from github import Github as GithubClient

        return GithubClient(login_or_token=self.access_config.get_secret_value().access_token)

    def get_repo(self) -> "Repository":
        """Build a fresh client and look up the repository named by ``url``."""
        client = self.get_client()
        return client.get_repo(self.url)

    def wrap_github_exception(self, e: "GithubException") -> Exception:
        """Map a ``GithubException`` onto a project error by its status code.

        401 becomes ``UserAuthError``, any other 4xx ``UserError`` and any 5xx
        ``ProviderError``. Anything else is logged at debug level and returned
        unchanged.
        """
        data = e.data
        status_code = e.status
        # The payload is read as a dict; fall back to its string form when it
        # is not one, instead of failing inside the error handler itself.
        message = data.get("message") if isinstance(data, dict) else str(data)
        if status_code == 401:
            return UserAuthError(f"Unauthorized access to Github: {message}")
        if 400 <= status_code < 500:
            return UserError(message)
        # ">=" so that a plain 500 is reported as a provider failure too.
        if status_code >= 500:
            return ProviderError(message)
        logger.debug(f"unhandled github error: {e}")
        return e

    def wrap_http_error(self, e: "HTTPError") -> Exception:
        """Map a ``requests`` ``HTTPError`` onto a project error by status code.

        Uses the same status mapping as ``wrap_github_exception``, with the
        response body as the message.
        """
        status_code = e.response.status_code
        if status_code == 401:
            return UserAuthError(f"Unauthorized access to Github: {e.response.text}")
        if 400 <= status_code < 500:
            return UserError(e.response.text)
        # ">=" so that a plain 500 is reported as a provider failure too.
        if status_code >= 500:
            return ProviderError(e.response.text)
        logger.debug(f"unhandled http error: {e}")
        return e

    @requires_dependencies(["requests"], extras="github")
    def wrap_error(self, e: Exception) -> Exception:
        """Dispatch ``e`` to the matching wrapper; return it unchanged otherwise."""
        from github.GithubException import GithubException
        from requests import HTTPError

        if isinstance(e, GithubException):
            return self.wrap_github_exception(e=e)
        if isinstance(e, HTTPError):
            return self.wrap_http_error(e=e)
        logger.debug(f"unhandled error: {e}")
        return e
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class GithubIndexerConfig(IndexerConfig):
    # Options that select which part of the repository gets indexed.

    # Branch to read the tree from; when left as None the indexer falls back
    # to the repository's default branch.
    branch: Optional[str] = Field(
        default=None,
        description="Branch to index, use the default if one isn't provided",
    )
    # Passed through as the ``recursive`` argument of the git tree lookup.
    recursive: bool = Field(
        default=True,
        description="Recursively index all files in the repository",
    )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@dataclass
class GithubIndexer(Indexer):
    """Lists the non-empty entries of a repository tree as ``FileData`` records."""

    connection_config: GithubConnectionConfig
    index_config: GithubIndexerConfig = field(default_factory=GithubIndexerConfig)
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Verify the repository can be reached; re-raise failures wrapped."""
        try:
            self.connection_config.get_repo()
        except Exception as e:
            raise self.connection_config.wrap_error(e=e)

    def get_branch(self) -> str:
        """Return the configured branch, or the repository's default branch.

        The repository is only looked up when no branch is configured.
        """
        return self.index_config.branch or self.connection_config.get_repo().default_branch

    def list_files(self) -> list["GitTreeElement"]:
        """Return the tree elements that report a size greater than zero."""
        repo = self.connection_config.get_repo()
        sha = self.index_config.branch or repo.default_branch
        git_tree = repo.get_git_tree(sha, recursive=self.index_config.recursive)
        # Entries without a size, and zero-byte entries, are skipped.
        return [
            element for element in git_tree.tree if element.size is not None and element.size > 0
        ]

    def convert_element(self, element: "GitTreeElement", branch: Optional[str] = None) -> FileData:
        """Build the ``FileData`` record for one tree element.

        Args:
            element: Tree element returned by ``list_files``.
            branch: Branch name to embed in the paths. When omitted it is
                resolved with ``get_branch()``; ``run`` passes it in so the
                lookup is not repeated for every element.
        """
        if branch is None:
            branch = self.get_branch()
        full_path = f"{self.connection_config.get_full_url()}/blob/{branch}/{element.path}"
        # Guard against a missing modification time rather than failing on
        # ``None.timestamp()``.
        # NOTE(review): assumes FileDataSourceMetadata.date_modified accepts None — confirm.
        modified = element.last_modified_datetime
        date_modified = str(modified.timestamp()) if modified is not None else None

        return FileData(
            identifier=str(uuid5(NAMESPACE_DNS, full_path)),
            connector_type=self.connector_type,
            display_name=full_path,
            source_identifiers=SourceIdentifiers(
                filename=Path(element.path).name,
                fullpath=(Path(branch) / element.path).as_posix(),
                rel_path=element.path,
            ),
            metadata=FileDataSourceMetadata(
                url=element.url,
                version=element.etag,
                record_locator={},
                date_modified=date_modified,
                date_processed=str(time()),
                filesize_bytes=element.size,
                permissions_data=[{"mode": element.mode}],
            ),
        )

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per listed element."""
        # The branch is the same for every element, so resolve it once here
        # instead of looking the repository up again inside convert_element.
        branch = self.get_branch()
        for element in self.list_files():
            yield self.convert_element(element=element, branch=branch)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class GithubDownloaderConfig(DownloaderConfig):
    # Declares no GitHub-specific download options of its own; everything
    # comes from DownloaderConfig.
    ...
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@dataclass
class GithubDownloader(Downloader):
    """Fetches the content of one indexed repository file and writes it to disk."""

    download_config: GithubDownloaderConfig
    connection_config: GithubConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["github"], extras="github")
    def get_file(self, file_data: FileData) -> "ContentFile":
        """Look up the repository content entry for ``file_data``.

        Raises:
            UserError: If GitHub reports the path as unknown.
        """
        from github.GithubException import UnknownObjectException

        path = file_data.source_identifiers.relative_path
        repo = self.connection_config.get_repo()

        try:
            # NOTE(review): no ``ref`` is passed here, so this presumably reads
            # from the default branch even when the indexer was pointed at
            # another one — confirm against the PyGithub get_contents docs.
            content_file = repo.get_contents(path)
        except UnknownObjectException as e:
            logger.error(f"File doesn't exist {self.connection_config.url}/{path}: {e}")
            # Chain explicitly so the GitHub error stays visible as the cause.
            raise UserError(f"File not found: {path}") from e
        return content_file

    @requires_dependencies(["requests"], extras="github")
    def get_contents(self, content_file: "ContentFile") -> bytes:
        """Return the file bytes, downloading them when none are inlined.

        Raises:
            Exception: The result of ``connection_config.wrap_error`` when the
                download answers with an HTTP error status.
        """
        import requests

        if content_file.decoded_content:
            return content_file.decoded_content
        download_url = content_file.download_url
        # Without a timeout a stalled connection would block this worker
        # indefinitely; the value bounds connect/read waits, not total time.
        request_timeout_seconds = 60
        resp = requests.get(download_url, timeout=request_timeout_seconds)
        try:
            resp.raise_for_status()
        except requests.HTTPError as e:
            raise self.connection_config.wrap_error(e=e)
        return resp.content

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download one file and return the download response for it."""
        content_file = self.get_file(file_data)
        contents = self.get_contents(content_file)
        download_path = self.get_download_path(file_data)
        # Repository paths can be nested; make sure the target directory exists
        # before writing (nothing visible here shows get_download_path doing so).
        download_path.parent.mkdir(parents=True, exist_ok=True)
        with download_path.open("wb") as f:
            f.write(contents)
        return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# Bundles the GitHub connection, indexer and downloader classes (with their
# config models) into the source registry entry for this connector.
github_source_entry = SourceRegistryEntry(
    connection_config=GithubConnectionConfig,
    indexer_config=GithubIndexerConfig,
    indexer=GithubIndexer,
    downloader_config=GithubDownloaderConfig,
    downloader=GithubDownloader,
)
|
|
@@ -8,9 +8,13 @@ from urllib.parse import urlparse
|
|
|
8
8
|
|
|
9
9
|
from pydantic import Field, Secret, model_validator
|
|
10
10
|
|
|
11
|
+
from unstructured_ingest.data_types.file_data import (
|
|
12
|
+
FileData,
|
|
13
|
+
FileDataSourceMetadata,
|
|
14
|
+
SourceIdentifiers,
|
|
15
|
+
)
|
|
11
16
|
from unstructured_ingest.error import SourceConnectionError
|
|
12
|
-
from unstructured_ingest.
|
|
13
|
-
from unstructured_ingest.v2.interfaces import (
|
|
17
|
+
from unstructured_ingest.interfaces import (
|
|
14
18
|
AccessConfig,
|
|
15
19
|
ConnectionConfig,
|
|
16
20
|
Downloader,
|
|
@@ -19,13 +23,9 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
19
23
|
Indexer,
|
|
20
24
|
IndexerConfig,
|
|
21
25
|
)
|
|
22
|
-
from unstructured_ingest.
|
|
23
|
-
from unstructured_ingest.
|
|
24
|
-
from unstructured_ingest.
|
|
25
|
-
FileData,
|
|
26
|
-
FileDataSourceMetadata,
|
|
27
|
-
SourceIdentifiers,
|
|
28
|
-
)
|
|
26
|
+
from unstructured_ingest.logger import logger
|
|
27
|
+
from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
|
|
28
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
29
29
|
|
|
30
30
|
CONNECTOR_TYPE = "gitlab"
|
|
31
31
|
if TYPE_CHECKING:
|
|
@@ -9,13 +9,16 @@ from dateutil import parser
|
|
|
9
9
|
from pydantic import Field, Secret
|
|
10
10
|
from pydantic.functional_validators import BeforeValidator
|
|
11
11
|
|
|
12
|
+
from unstructured_ingest.data_types.file_data import (
|
|
13
|
+
FileData,
|
|
14
|
+
FileDataSourceMetadata,
|
|
15
|
+
SourceIdentifiers,
|
|
16
|
+
)
|
|
12
17
|
from unstructured_ingest.error import (
|
|
13
18
|
SourceConnectionError,
|
|
14
19
|
SourceConnectionNetworkError,
|
|
15
20
|
)
|
|
16
|
-
from unstructured_ingest.
|
|
17
|
-
from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
|
|
18
|
-
from unstructured_ingest.v2.interfaces import (
|
|
21
|
+
from unstructured_ingest.interfaces import (
|
|
19
22
|
AccessConfig,
|
|
20
23
|
ConnectionConfig,
|
|
21
24
|
Downloader,
|
|
@@ -24,21 +27,27 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
24
27
|
Indexer,
|
|
25
28
|
IndexerConfig,
|
|
26
29
|
)
|
|
27
|
-
from unstructured_ingest.
|
|
28
|
-
from unstructured_ingest.
|
|
29
|
-
from unstructured_ingest.
|
|
30
|
-
from unstructured_ingest.
|
|
31
|
-
FileData,
|
|
32
|
-
FileDataSourceMetadata,
|
|
33
|
-
SourceIdentifiers,
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
CONNECTOR_TYPE = "google_drive"
|
|
30
|
+
from unstructured_ingest.logger import logger
|
|
31
|
+
from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
|
|
32
|
+
from unstructured_ingest.processes.connectors.utils import conform_string_to_dict
|
|
33
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
37
34
|
|
|
38
35
|
if TYPE_CHECKING:
|
|
39
36
|
from googleapiclient.discovery import Resource as GoogleAPIResource
|
|
40
37
|
from googleapiclient.http import MediaIoBaseDownload
|
|
41
38
|
|
|
39
|
+
# Connector type label for the Google Drive connector.
CONNECTOR_TYPE = "google_drive"

# Google-native MIME type -> MIME type to use in its place. Documents, sheets
# and slides map to the matching Office Open XML type, photos to JPEG.
GOOGLE_DRIVE_EXPORT_TYPES = dict(
    [
        (
            "application/vnd.google-apps.document",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            "application/vnd.google-apps.spreadsheet",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (
            "application/vnd.google-apps.presentation",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        ("application/vnd.google-apps.photo", "image/jpeg"),
    ]
)
|
|
50
|
+
|
|
42
51
|
|
|
43
52
|
class GoogleDriveAccessConfig(AccessConfig):
|
|
44
53
|
service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
|