unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +1 -1
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import shutil
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
|
|
6
|
-
from unstructured_ingest.logger import logger
|
|
7
|
-
from unstructured_ingest.pipeline.interfaces import CopyNode
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class Copier(CopyNode):
    """Pipeline node that copies a processed JSON file to its ingest doc's output path."""

    def run(self, json_path: str):
        """Copy ``json_path`` to the output filename of the matching ingest doc.

        The base name of ``json_path`` without its extension is the key used to
        look the ingest doc up in ``pipeline_context.ingest_docs_map``.
        """
        doc_hash, _ = os.path.splitext(os.path.basename(json_path))
        ingest_doc = create_ingest_doc_from_dict(
            self.pipeline_context.ingest_docs_map[doc_hash],
        )
        destination = ingest_doc._output_filename
        # Create any missing parent directories before copying into them.
        Path(destination).parent.mkdir(parents=True, exist_ok=True)
        logger.info(f"copying {json_path} -> {destination}")
        shutil.copy(json_path, destination)
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.pipeline.interfaces import DocFactoryNode
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
@dataclass
class DocFactory(DocFactoryNode):
    """Pipeline node that lists the ingest docs offered by the source connector."""

    def run(self, *args, **kwargs) -> t.Iterable[dict]:
        """Return every ingest doc of the source connector serialized as a dict.

        Positional and keyword arguments are accepted but not used.
        """
        return [
            ingest_doc.to_dict()
            for ingest_doc in self.source_doc_connector.get_ingest_docs()
        ]
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import json
|
|
3
|
-
import typing as t
|
|
4
|
-
from dataclasses import dataclass
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Optional
|
|
7
|
-
|
|
8
|
-
from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
|
|
9
|
-
from unstructured_ingest.error import PartitionError
|
|
10
|
-
from unstructured_ingest.logger import logger
|
|
11
|
-
from unstructured_ingest.pipeline.interfaces import PartitionNode
|
|
12
|
-
from unstructured_ingest.pipeline.utils import get_ingest_doc_hash
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@dataclass
class Partitioner(PartitionNode):
    """Pipeline node that partitions one ingest doc into a JSON file of elements."""

    @PartitionError.wrap
    def run(self, ingest_doc_dict) -> Optional[str]:
        """Partition the doc described by ``ingest_doc_dict``.

        Returns the path of the JSON file holding the partitioned elements, or
        ``None`` when an error occurred and ``pipeline_context.raise_on_error``
        is falsy. A non-empty output file that already exists is reused unless
        ``pipeline_context.reprocess`` is set.
        """
        try:
            doc = create_ingest_doc_from_dict(ingest_doc_dict)
            doc_hash = get_ingest_doc_hash(ingest_doc_dict)
            # Output name mixes this node's config hash with the doc's own hash.
            digest_input = f"{self.create_hash()}{doc_hash}"
            hashed_filename = hashlib.sha256(digest_input.encode()).hexdigest()[:32]
            self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_dict
            json_path = (Path(self.get_path()) / f"{hashed_filename}.json").resolve()

            may_reuse = not self.pipeline_context.reprocess
            if may_reuse and json_path.is_file() and json_path.stat().st_size:
                logger.info(f"file exists: {json_path}, skipping partition")
                return str(json_path)

            config = self.partition_config
            partition_kwargs: t.Dict[str, t.Any] = dict(
                strategy=config.strategy,
                encoding=config.encoding,
                pdf_infer_table_structure=config.pdf_infer_table_structure,
                languages=config.ocr_languages,
                hi_res_model_name=config.hi_res_model_name,
            )
            # Optional settings are only forwarded when they are truthy.
            if config.skip_infer_table_types:
                partition_kwargs["skip_infer_table_types"] = config.skip_infer_table_types
            if config.additional_partition_args:
                partition_kwargs.update(config.additional_partition_args)

            elements = doc.process_file(partition_config=config, **partition_kwargs)
            with open(json_path, "w", encoding="utf8") as output_f:
                logger.info(f"writing partitioned content to {json_path}")
                json.dump(elements, output_f, ensure_ascii=False, indent=2, sort_keys=True)
            return str(json_path)
        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise
            logger.error(f"failed to partition doc: {ingest_doc_dict}, {e}", exc_info=True)
            return None
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
from unstructured_ingest.interfaces import PermissionsCleanupMixin, ProcessorConfig
|
|
4
|
-
from unstructured_ingest.pipeline.interfaces import PermissionsNode
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
@dataclass
class PermissionsDataCleaner(PermissionsNode, PermissionsCleanupMixin):
    """Pipeline node whose only job is to trigger the permissions cleanup."""

    # Processor configuration; not read directly by the code in this class.
    processor_config: ProcessorConfig

    def run(self):
        """Invoke ``cleanup_permissions`` (presumably provided by the mixin)."""
        self.cleanup_permissions()
|
|
@@ -1,134 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import hashlib
|
|
4
|
-
import json
|
|
5
|
-
import os.path
|
|
6
|
-
from dataclasses import dataclass
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING, Optional
|
|
9
|
-
|
|
10
|
-
from unstructured_ingest.interfaces import ChunkingConfig, PartitionConfig
|
|
11
|
-
from unstructured_ingest.logger import logger
|
|
12
|
-
from unstructured_ingest.pipeline.interfaces import ReformatNode
|
|
13
|
-
from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
|
|
14
|
-
|
|
15
|
-
if TYPE_CHECKING:
|
|
16
|
-
from unstructured.documents.elements import Element
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
class Chunker(ReformatNode):
    """Implementation for the chunking node in the ingest pipeline.

    Parameters
    ----------
    pipeline_context: PipelineContext (inherited from parent class)
    chunking_config: ChunkingConfig
    partition_config: PartitionConfig
    """

    chunking_config: ChunkingConfig
    partition_config: PartitionConfig

    def initialize(self):
        """Log the chunking configuration, then run the parent's initialization."""
        logger.info(
            f"Running chunking node. Chunking config: {self.chunking_config.to_json()}",
        )
        super().initialize()

    def create_hash(self) -> str:
        """Return a 32-character hex digest of the chunking configuration."""
        hash_dict = self.chunking_config.to_dict()
        return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

    def run(self, elements_json: str) -> Optional[str]:
        """Chunk the elements stored in ``elements_json`` and write them to a new file.

        Returns the path of the written (or reused) JSON file, or ``None`` when
        no chunks were produced or when an error occurred and
        ``pipeline_context.raise_on_error`` is falsy. A non-empty output file
        that already exists is reused unless ``pipeline_context.reprocess`` is set.
        """
        try:
            filename_ext = os.path.basename(elements_json)
            filename = os.path.splitext(filename_ext)[0]
            hashed_filename = hashlib.sha256(
                f"{self.create_hash()}{filename}".encode(),
            ).hexdigest()[:32]
            json_filename = f"{hashed_filename}.json"
            json_path = (Path(self.get_path()) / json_filename).resolve()
            # Register the ingest doc under the new hash as well as the old one.
            self.pipeline_context.ingest_docs_map[hashed_filename] = (
                self.pipeline_context.ingest_docs_map[filename]
            )
            if (
                not self.pipeline_context.reprocess
                and json_path.is_file()
                and json_path.stat().st_size
            ):
                logger.debug(f"file exists: {json_path}, skipping chunking")
                return str(json_path)

            chunked_elements = self.chunk(elements_json)

            # -- chunk() yields None when no strategy is set or none is usable --
            if chunked_elements is None:
                # chunk() already warns about an unusable strategy, so only
                # report the "no strategy" case here.
                if self.chunking_config.chunking_strategy is None:
                    logger.info(f"chunking_strategy is None, skipping chunking for {filename_ext}")
                return None

            element_dicts = [e.to_dict() for e in chunked_elements]
            assign_and_map_hash_ids(elements=element_dicts)

            with open(json_path, "w", encoding="utf8") as output_f:
                logger.info(f"writing chunking content to {json_path}")
                json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
            return str(json_path)

        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise
            logger.error(f"failed to run chunking on file {elements_json}, {e}", exc_info=True)
            return None

    def get_path(self) -> Path:
        """Return the resolved ``chunked`` directory under the pipeline work dir."""
        return (Path(self.pipeline_context.work_dir) / "chunked").resolve()

    def chunk(self, elements_json_file: str) -> Optional[list["Element"]]:
        """Called by Chunker.run() to properly execute the defined chunking_strategy.

        Returns the chunked elements, or ``None`` when no strategy is configured
        or the configured strategy is neither locally available nor run via the API.
        """
        # -- No chunking_strategy means no chunking --
        if self.chunking_config.chunking_strategy is None:
            return None
        # -- Chunk locally for open-source chunking strategies, even when partitioning remotely --
        if self.chunking_config.chunking_strategy in ("basic", "by_title"):
            from unstructured.chunking import dispatch
            from unstructured.staging.base import elements_from_json

            return dispatch.chunk(
                elements=elements_from_json(filename=elements_json_file),
                chunking_strategy=self.chunking_config.chunking_strategy,
                combine_text_under_n_chars=self.chunking_config.combine_text_under_n_chars,
                include_orig_elements=self.chunking_config.include_orig_elements,
                max_characters=self.chunking_config.max_characters,
                multipage_sections=self.chunking_config.multipage_sections,
                new_after_n_chars=self.chunking_config.new_after_n_chars,
                overlap=self.chunking_config.overlap,
                overlap_all=self.chunking_config.overlap_all,
            )
        # -- Chunk remotely --
        if self.partition_config.partition_by_api:
            from unstructured.partition.api import partition_via_api

            return partition_via_api(
                filename=elements_json_file,
                # -- NOTE(jennings): If api_key or api_url are None, partition_via_api will raise an
                # -- error, which will be caught and logged by Chunker.run()
                api_key=self.partition_config.api_key,  # type: ignore
                api_url=self.partition_config.partition_endpoint,  # type: ignore
                chunking_strategy=self.chunking_config.chunking_strategy,
                # NOTE: the keyword here is `combine_under_n_chars`, unlike the local call above.
                combine_under_n_chars=self.chunking_config.combine_text_under_n_chars,
                include_orig_elements=self.chunking_config.include_orig_elements,
                max_characters=self.chunking_config.max_characters,
                multipage_sections=self.chunking_config.multipage_sections,
                new_after_n_chars=self.chunking_config.new_after_n_chars,
                overlap=self.chunking_config.overlap,
                overlap_all=self.chunking_config.overlap_all,
            )
        # -- Warn that the defined chunking_strategy is not locally available --
        logger.warning(
            f"There is no locally available chunking_strategy:"
            f" {self.chunking_config.chunking_strategy}."
            f" If trying to partition remotely, check that `partition_by_api`, `api_url`,"
            f" and `api_key` are correctly defined."
        )
        return None
|
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import json
|
|
3
|
-
import os.path
|
|
4
|
-
from dataclasses import dataclass
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Optional
|
|
7
|
-
|
|
8
|
-
from unstructured_ingest.interfaces import (
|
|
9
|
-
EmbeddingConfig,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.logger import logger
|
|
12
|
-
from unstructured_ingest.pipeline.interfaces import ReformatNode
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@dataclass
class Embedder(ReformatNode):
    """Pipeline node that adds embeddings to the elements of a JSON file."""

    embedder_config: EmbeddingConfig

    def initialize(self):
        """Log the embedding configuration, then run the parent's initialization."""
        logger.info(
            f"Running embedding node. Embedding config: {self.embedder_config.to_json()}",
        )
        super().initialize()

    def create_hash(self) -> str:
        """Return a 32-character hex digest of the embedding configuration."""
        hash_dict = self.embedder_config.to_dict()
        return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

    def run(self, elements_json: str) -> Optional[str]:
        """Embed the elements stored in ``elements_json`` and write them to a new file.

        Returns the path of the written (or reused) JSON file, or ``None`` when
        an error occurred and ``pipeline_context.raise_on_error`` is falsy. A
        non-empty output file that already exists is reused unless
        ``pipeline_context.reprocess`` is set.
        """
        try:
            filename_ext = os.path.basename(elements_json)
            filename = os.path.splitext(filename_ext)[0]
            hashed_filename = hashlib.sha256(
                f"{self.create_hash()}{filename}".encode(),
            ).hexdigest()[:32]
            json_filename = f"{hashed_filename}.json"
            json_path = (Path(self.get_path()) / json_filename).resolve()
            # Register the ingest doc under the new hash as well as the old one.
            self.pipeline_context.ingest_docs_map[hashed_filename] = (
                self.pipeline_context.ingest_docs_map[filename]
            )
            if (
                not self.pipeline_context.reprocess
                and json_path.is_file()
                and json_path.stat().st_size
            ):
                logger.debug(f"file exists: {json_path}, skipping embedding")
                return str(json_path)
            # Read with the same encoding this node writes with; the platform
            # default encoding would mis-decode non-ASCII content on some systems.
            with open(elements_json, encoding="utf8") as f:
                elements = json.load(f)
            embedder = self.embedder_config.get_embedder()
            element_dicts = embedder.embed_documents(elements=elements)
            with open(json_path, "w", encoding="utf8") as output_f:
                logger.info(f"writing embeddings content to {json_path}")
                json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
            return str(json_path)
        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise
            logger.error(f"failed to embed content from file {elements_json}, {e}", exc_info=True)
            return None

    def get_path(self) -> Path:
        """Return the resolved output directory under the pipeline work dir."""
        # NOTE(review): the directory is literally named "embedded.py"; this looks
        # unintentional but is kept so previously written outputs are still found.
        return (Path(self.pipeline_context.work_dir) / "embedded.py").resolve()
|
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import typing as t
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
|
|
6
|
-
from unstructured_ingest.interfaces import (
|
|
7
|
-
BaseIngestDocBatch,
|
|
8
|
-
BaseSessionHandle,
|
|
9
|
-
BaseSingleIngestDoc,
|
|
10
|
-
IngestDocSessionHandleMixin,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.logger import logger
|
|
13
|
-
from unstructured_ingest.pipeline.interfaces import SourceNode
|
|
14
|
-
|
|
15
|
-
# module-level variable to store session handle
|
|
16
|
-
session_handle: t.Optional[BaseSessionHandle] = None
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
class Reader(SourceNode):
    """Pipeline node that fetches the file(s) behind an ingest doc or doc batch."""

    def get_single(self, doc: BaseSingleIngestDoc, ingest_doc_dict: dict) -> str:
        """Fetch one doc unless a non-empty local copy can be reused.

        ``ingest_doc_dict`` is updated in place with ``doc.to_dict()``.
        Returns ``doc.filename``.
        """
        # NOTE(review): `doc.filename` is used as a path-like object below although the
        # return annotation says `str`; the value is returned unchanged.
        if (
            not self.read_config.re_download
            and doc.filename.is_file()
            and doc.filename.stat().st_size
        ):
            logger.info(f"file exists: {doc.filename}, skipping download")
            # Still need to fetch metadata if file exists locally
            doc.update_source_metadata()
        else:
            serialized_doc = doc.to_json(redact_sensitive=True)
            logger.debug(f"fetching {serialized_doc} - PID: {os.getpid()}")
            if self.retry_strategy:
                self.retry_strategy(doc.get_file)
            else:
                doc.get_file()
        ingest_doc_dict.update(doc.to_dict())
        return doc.filename

    def get_batch(self, doc_batch: BaseIngestDocBatch, ingest_doc_dict: dict) -> t.List[str]:
        """Fetch all files of a batch and return the filename of each contained doc.

        ``ingest_doc_dict`` is updated in place with ``doc_batch.to_dict()``.
        """
        if self.retry_strategy:
            self.retry_strategy(doc_batch.get_files)
        else:
            doc_batch.get_files()
        ingest_doc_dict.update(doc_batch.to_dict())
        return [doc.filename for doc in doc_batch.ingest_docs]

    def run(self, ingest_doc_dict: dict) -> t.Optional[t.Union[str, t.List[str]]]:
        """Rebuild the doc from ``ingest_doc_dict`` and fetch its data.

        Returns one filename for a single doc, a list of filenames for a batch,
        or ``None`` when an error occurred and ``pipeline_context.raise_on_error``
        is falsy. The ValueError for an unrecognized doc type is raised inside
        the ``try`` and is therefore subject to the same handling.
        """
        try:
            global session_handle
            doc = create_ingest_doc_from_dict(ingest_doc_dict)
            if isinstance(doc, IngestDocSessionHandleMixin):
                if session_handle is None:
                    # create via doc.session_handle, which is a property that creates a
                    # session handle if one is not already defined
                    session_handle = doc.session_handle
                else:
                    doc._session_handle = session_handle
            if isinstance(doc, BaseSingleIngestDoc):
                return self.get_single(doc=doc, ingest_doc_dict=ingest_doc_dict)
            elif isinstance(doc, BaseIngestDocBatch):
                return self.get_batch(doc_batch=doc, ingest_doc_dict=ingest_doc_dict)
            else:
                raise ValueError(
                    f"type of doc ({type(doc)}) is not a recognized type: "
                    f"BaseSingleIngestDoc or BaseIngestDocBatch"
                )
        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise
            logger.error(
                f"failed to get data associated with source doc: {ingest_doc_dict}, {e}",
                exc_info=True,
            )
            return None
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
import os.path
|
|
2
|
-
import typing as t
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.connector.registry import create_ingest_doc_from_dict
|
|
6
|
-
from unstructured_ingest.pipeline.interfaces import WriteNode
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@dataclass
class Writer(WriteNode):
    """Pipeline node that hands the processed ingest docs to the destination connector."""

    def run(self, json_paths: t.List[str]):
        """Rebuild the ingest doc behind each path and write them all in one call.

        Each path's base name without extension is the key used to look the
        doc up in ``pipeline_context.ingest_docs_map``.
        """

        def load_doc(path: str):
            doc_hash, _ = os.path.splitext(os.path.basename(path))
            return create_ingest_doc_from_dict(self.pipeline_context.ingest_docs_map[doc_hash])

        self.dest_doc_connector.write(docs=[load_doc(path) for path in json_paths])
|
unstructured_ingest/processor.py
DELETED
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import multiprocessing as mp
|
|
4
|
-
from contextlib import suppress
|
|
5
|
-
from typing import Optional
|
|
6
|
-
|
|
7
|
-
from unstructured_ingest.interfaces import (
|
|
8
|
-
BaseDestinationConnector,
|
|
9
|
-
BaseSourceConnector,
|
|
10
|
-
ChunkingConfig,
|
|
11
|
-
EmbeddingConfig,
|
|
12
|
-
PartitionConfig,
|
|
13
|
-
PermissionsConfig,
|
|
14
|
-
ProcessorConfig,
|
|
15
|
-
RetryStrategyConfig,
|
|
16
|
-
)
|
|
17
|
-
from unstructured_ingest.pipeline import (
|
|
18
|
-
Chunker,
|
|
19
|
-
DocFactory,
|
|
20
|
-
Embedder,
|
|
21
|
-
Partitioner,
|
|
22
|
-
PermissionsDataCleaner,
|
|
23
|
-
Pipeline,
|
|
24
|
-
PipelineContext,
|
|
25
|
-
Reader,
|
|
26
|
-
ReformatNode,
|
|
27
|
-
Writer,
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
with suppress(RuntimeError):
|
|
31
|
-
mp.set_start_method("spawn")
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def process_documents(
    processor_config: ProcessorConfig,
    source_doc_connector: BaseSourceConnector,
    partition_config: PartitionConfig,
    dest_doc_connector: Optional[BaseDestinationConnector] = None,
    chunking_config: Optional[ChunkingConfig] = None,
    embedder_config: Optional[EmbeddingConfig] = None,
    permissions_config: Optional[PermissionsConfig] = None,
    retry_strategy_config: Optional[RetryStrategyConfig] = None,
) -> None:
    """Assemble the ingest pipeline from the given configs and run it.

    The doc factory, reader and partitioner nodes are always created. The
    chunker, embedder, writer and permissions-cleanup nodes are only created
    when their corresponding argument is truthy.
    """
    context = PipelineContext.from_dict(processor_config.to_dict())

    factory_node = DocFactory(
        pipeline_context=context,
        source_doc_connector=source_doc_connector,
    )
    source_node = Reader(
        pipeline_context=context,
        retry_strategy_config=retry_strategy_config,
        read_config=source_doc_connector.read_config,
    )
    partition_node = Partitioner(pipeline_context=context, partition_config=partition_config)

    # Reformat nodes are appended in order: chunking first, then embedding.
    reformat_nodes: list[ReformatNode] = []
    if chunking_config:
        chunker = Chunker(
            pipeline_context=context,
            chunking_config=chunking_config,
            partition_config=partition_config,
        )
        reformat_nodes.append(chunker)
    if embedder_config:
        embedder = Embedder(
            pipeline_context=context,
            embedder_config=embedder_config,
        )
        reformat_nodes.append(embedder)

    write_node = None
    if dest_doc_connector:
        write_node = Writer(
            pipeline_context=context,
            dest_doc_connector=dest_doc_connector,
        )

    # permissions_config only acts as an on/off switch here; the node itself
    # is built from processor_config.
    permissions_node = None
    if permissions_config:
        permissions_node = PermissionsDataCleaner(
            pipeline_context=context,
            processor_config=processor_config,
        )

    Pipeline(
        pipeline_context=context,
        doc_factory_node=factory_node,
        source_node=source_node,
        partition_node=partition_node,
        reformat_nodes=reformat_nodes,
        write_node=write_node,
        permissions_node=permissions_node,
    ).run()
|
|
@@ -1,104 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from typing import Type
|
|
3
|
-
|
|
4
|
-
from .airtable import AirtableRunner
|
|
5
|
-
from .astradb import AstraDBRunner
|
|
6
|
-
from .base_runner import Runner
|
|
7
|
-
from .biomed import BiomedRunner
|
|
8
|
-
from .confluence import ConfluenceRunner
|
|
9
|
-
from .delta_table import DeltaTableRunner
|
|
10
|
-
from .discord import DiscordRunner
|
|
11
|
-
from .elasticsearch import ElasticSearchRunner
|
|
12
|
-
from .fsspec.azure import AzureRunner
|
|
13
|
-
from .fsspec.box import BoxRunner
|
|
14
|
-
from .fsspec.dropbox import DropboxRunner
|
|
15
|
-
from .fsspec.fsspec import FsspecRunner
|
|
16
|
-
from .fsspec.gcs import GCSRunner
|
|
17
|
-
from .fsspec.s3 import S3Runner
|
|
18
|
-
from .fsspec.sftp import SftpRunner
|
|
19
|
-
from .github import GithubRunner
|
|
20
|
-
from .gitlab import GitlabRunner
|
|
21
|
-
from .google_drive import GoogleDriveRunner
|
|
22
|
-
from .hubspot import HubSpotRunner
|
|
23
|
-
from .jira import JiraRunner
|
|
24
|
-
from .kafka import KafkaRunner
|
|
25
|
-
from .local import LocalRunner
|
|
26
|
-
from .mongodb import MongoDBRunner
|
|
27
|
-
from .notion import NotionRunner
|
|
28
|
-
from .onedrive import OneDriveRunner
|
|
29
|
-
from .opensearch import OpenSearchRunner
|
|
30
|
-
from .outlook import OutlookRunner
|
|
31
|
-
from .reddit import RedditRunner
|
|
32
|
-
from .salesforce import SalesforceRunner
|
|
33
|
-
from .sharepoint import SharePointRunner
|
|
34
|
-
from .slack import SlackRunner
|
|
35
|
-
from .wikipedia import WikipediaRunner
|
|
36
|
-
|
|
37
|
-
# Maps each connector name to its Runner class; "gdrive" and "google_drive"
# both resolve to the same runner.
runner_map: t.Dict[str, Type[Runner]] = {
    "airtable": AirtableRunner,
    "astradb": AstraDBRunner,
    "azure": AzureRunner,
    "biomed": BiomedRunner,
    "box": BoxRunner,
    "confluence": ConfluenceRunner,
    "delta_table": DeltaTableRunner,
    "discord": DiscordRunner,
    "dropbox": DropboxRunner,
    "elasticsearch": ElasticSearchRunner,
    "fsspec": FsspecRunner,
    "gcs": GCSRunner,
    "github": GithubRunner,
    "gitlab": GitlabRunner,
    "gdrive": GoogleDriveRunner,
    "google_drive": GoogleDriveRunner,
    "hubspot": HubSpotRunner,
    "jira": JiraRunner,
    "kafka": KafkaRunner,
    "local": LocalRunner,
    "mongodb": MongoDBRunner,
    "notion": NotionRunner,
    "onedrive": OneDriveRunner,
    "opensearch": OpenSearchRunner,
    "outlook": OutlookRunner,
    "reddit": RedditRunner,
    "s3": S3Runner,
    "salesforce": SalesforceRunner,
    "sftp": SftpRunner,
    "sharepoint": SharePointRunner,
    "slack": SlackRunner,
    "wikipedia": WikipediaRunner,
}

# Every entry must be a name bound in this module, otherwise a star import
# raises AttributeError. "AstraDBRunner" replaces the non-existent "AstraRunner";
# "HubSpotRunner" and "SftpRunner" were imported above but not exported.
__all__ = [
    "AirtableRunner",
    "AstraDBRunner",
    "AzureRunner",
    "BiomedRunner",
    "BoxRunner",
    "ConfluenceRunner",
    "DeltaTableRunner",
    "DiscordRunner",
    "DropboxRunner",
    "ElasticSearchRunner",
    "FsspecRunner",
    "GCSRunner",
    "GoogleDriveRunner",
    "GithubRunner",
    "GitlabRunner",
    "HubSpotRunner",
    "JiraRunner",
    "KafkaRunner",
    "LocalRunner",
    "MongoDBRunner",
    "NotionRunner",
    "OneDriveRunner",
    "OpenSearchRunner",
    "OutlookRunner",
    "RedditRunner",
    "S3Runner",
    "SalesforceRunner",
    "SftpRunner",
    "SharePointRunner",
    "SlackRunner",
    "WikipediaRunner",
    "runner_map",
    "Runner",
]
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import typing as t
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.interfaces import BaseSourceConnector
|
|
6
|
-
from unstructured_ingest.logger import logger
|
|
7
|
-
from unstructured_ingest.runner.base_runner import Runner
|
|
8
|
-
from unstructured_ingest.runner.utils import update_download_dir_hash
|
|
9
|
-
|
|
10
|
-
if t.TYPE_CHECKING:
|
|
11
|
-
from unstructured_ingest.connector.airtable import SimpleAirtableConfig
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
|
|
15
|
-
class AirtableRunner(Runner):
|
|
16
|
-
connector_config: "SimpleAirtableConfig"
|
|
17
|
-
|
|
18
|
-
def update_read_config(self):
|
|
19
|
-
hashed_dir_name = hashlib.sha256(
|
|
20
|
-
self.connector_config.access_config.personal_access_token.encode("utf-8"),
|
|
21
|
-
)
|
|
22
|
-
|
|
23
|
-
self.read_config.download_dir = update_download_dir_hash(
|
|
24
|
-
connector_name="airtable",
|
|
25
|
-
read_config=self.read_config,
|
|
26
|
-
hashed_dir_name=hashed_dir_name,
|
|
27
|
-
logger=logger,
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
|
|
31
|
-
from unstructured_ingest.connector.airtable import (
|
|
32
|
-
AirtableSourceConnector,
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
return AirtableSourceConnector
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import typing as t
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.interfaces import BaseSourceConnector
|
|
6
|
-
from unstructured_ingest.logger import logger
|
|
7
|
-
from unstructured_ingest.runner.base_runner import Runner
|
|
8
|
-
from unstructured_ingest.runner.utils import update_download_dir_hash
|
|
9
|
-
|
|
10
|
-
if t.TYPE_CHECKING:
|
|
11
|
-
from unstructured_ingest.connector.astradb import SimpleAstraDBConfig
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
|
|
15
|
-
class AstraDBRunner(Runner):
|
|
16
|
-
connector_config: "SimpleAstraDBConfig"
|
|
17
|
-
|
|
18
|
-
def update_read_config(self):
|
|
19
|
-
hashed_dir_name = hashlib.sha256(
|
|
20
|
-
str(self.connector_config.access_config.api_endpoint).encode("utf-8"),
|
|
21
|
-
)
|
|
22
|
-
self.read_config.download_dir = update_download_dir_hash(
|
|
23
|
-
connector_name="astradb",
|
|
24
|
-
read_config=self.read_config,
|
|
25
|
-
hashed_dir_name=hashed_dir_name,
|
|
26
|
-
logger=logger,
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
|
|
30
|
-
from unstructured_ingest.connector.astradb import (
|
|
31
|
-
AstraDBSourceConnector,
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
return AstraDBSourceConnector
|