unstructured-ingest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +1 -1
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/github.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +21 -21
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.4.dist-info/RECORD +0 -591
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.4.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
examples/mongodb.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
5
|
+
from unstructured_ingest.logger import logger
|
|
6
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
7
|
+
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
8
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
9
|
+
LocalConnectionConfig,
|
|
10
|
+
LocalDownloaderConfig,
|
|
11
|
+
LocalIndexerConfig,
|
|
12
|
+
)
|
|
13
|
+
from unstructured_ingest.processes.connectors.mongodb import (
|
|
14
|
+
CONNECTOR_TYPE,
|
|
15
|
+
MongoDBAccessConfig,
|
|
16
|
+
MongoDBConnectionConfig,
|
|
17
|
+
MongoDBUploaderConfig,
|
|
18
|
+
MongoDBUploadStagerConfig,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
21
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
22
|
+
|
|
23
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path.joinpath("example-docs")
work_dir = base_path.joinpath("tmp_ingest", CONNECTOR_TYPE)
output_path = work_dir.joinpath("output")
download_path = work_dir.joinpath("download")

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    # Source side: local files under example-docs/multisimple/.
    processor = ProcessorConfig(work_dir=str(work_dir.resolve()))
    indexer = LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/")
    downloader = LocalDownloaderConfig(download_dir=download_path)
    source_connection = LocalConnectionConfig()

    # Processing steps: partition, chunk by title, then embed.
    partitioner = PartitionerConfig(strategy="fast")
    chunker = ChunkerConfig(
        chunking_strategy="by_title",
        chunk_include_orig_elements=False,
        chunk_max_characters=1500,
        chunk_multipage_sections=True,
    )
    embedder = EmbedderConfig(embedding_provider="huggingface")

    # Destination side: MongoDB on localhost:27017. No URI is supplied; host and
    # port are passed explicitly. The collection name gets a random four-digit
    # suffix on every run.
    destination_connection = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri=None),
        host="localhost",
        port=27017,
        collection=f"test-collection-{random.randint(1000, 9999)}",
        database="testDatabase",
    )
    stager = MongoDBUploadStagerConfig()
    uploader = MongoDBUploaderConfig(batch_size=10)

    pipeline = Pipeline.from_configs(
        context=processor,
        indexer_config=indexer,
        downloader_config=downloader,
        source_connection_config=source_connection,
        partitioner_config=partitioner,
        chunker_config=chunker,
        embedder_config=embedder,
        destination_connection_config=destination_connection,
        stager_config=stager,
        uploader_config=uploader,
    )
    pipeline.run()
|
examples/opensearch.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
4
|
+
from unstructured_ingest.logger import logger
|
|
5
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
6
|
+
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
7
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
8
|
+
LocalConnectionConfig,
|
|
9
|
+
LocalDownloaderConfig,
|
|
10
|
+
LocalIndexerConfig,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.processes.connectors.opensearch import (
|
|
13
|
+
CONNECTOR_TYPE,
|
|
14
|
+
OpenSearchAccessConfig,
|
|
15
|
+
OpenSearchConnectionConfig,
|
|
16
|
+
OpenSearchUploaderConfig,
|
|
17
|
+
OpenSearchUploadStagerConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
20
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
21
|
+
|
|
22
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path.joinpath("example-docs")
work_dir = base_path.joinpath("tmp_ingest", CONNECTOR_TYPE)
output_path = work_dir.joinpath("output")
download_path = work_dir.joinpath("download")

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    # Source side: a single local text file from example-docs.
    processor = ProcessorConfig(work_dir=str(work_dir.resolve()))
    indexer = LocalIndexerConfig(
        input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
    )
    downloader = LocalDownloaderConfig(download_dir=download_path)
    source_connection = LocalConnectionConfig()

    # Processing steps: partition, chunk by title, then embed.
    partitioner = PartitionerConfig(strategy="fast")
    chunker = ChunkerConfig(chunking_strategy="by_title")
    embedder = EmbedderConfig(embedding_provider="huggingface")

    # Destination side: OpenSearch on localhost:9247 with admin/admin credentials.
    destination_connection = OpenSearchConnectionConfig(
        hosts="http://localhost:9247",
        username="admin",
        use_ssl=True,
        access_config=OpenSearchAccessConfig(password="admin"),
    )
    # The stager and the uploader are both pointed at the same index name.
    index_name = "ingest-test-destination"
    stager = OpenSearchUploadStagerConfig(index_name=index_name)
    uploader = OpenSearchUploaderConfig(index_name=index_name, batch_size_bytes=150)

    pipeline = Pipeline.from_configs(
        context=processor,
        indexer_config=indexer,
        downloader_config=downloader,
        source_connection_config=source_connection,
        partitioner_config=partitioner,
        chunker_config=chunker,
        embedder_config=embedder,
        destination_connection_config=destination_connection,
        stager_config=stager,
        uploader_config=uploader,
    )
    pipeline.run()
|
examples/pinecone.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
5
|
+
from unstructured_ingest.logger import logger
|
|
6
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
7
|
+
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
8
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
9
|
+
LocalConnectionConfig,
|
|
10
|
+
LocalDownloaderConfig,
|
|
11
|
+
LocalIndexerConfig,
|
|
12
|
+
)
|
|
13
|
+
from unstructured_ingest.processes.connectors.pinecone import (
|
|
14
|
+
CONNECTOR_TYPE,
|
|
15
|
+
PineconeAccessConfig,
|
|
16
|
+
PineconeConnectionConfig,
|
|
17
|
+
PineconeUploaderConfig,
|
|
18
|
+
PineconeUploadStagerConfig,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
21
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
22
|
+
|
|
23
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path.joinpath("example-docs")
work_dir = base_path.joinpath("tmp_ingest", CONNECTOR_TYPE)
output_path = work_dir.joinpath("output")
download_path = work_dir.joinpath("download")

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    # Source side: a single local text file from example-docs.
    processor = ProcessorConfig(work_dir=str(work_dir.resolve()))
    indexer = LocalIndexerConfig(
        input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
    )
    downloader = LocalDownloaderConfig(download_dir=download_path)
    source_connection = LocalConnectionConfig()

    # Processing steps: partition, chunk by title, then embed.
    partitioner = PartitionerConfig(strategy="fast")
    chunker = ChunkerConfig(chunking_strategy="by_title")
    embedder = EmbedderConfig(embedding_provider="huggingface")

    # Destination side: Pinecone, configured from the environment.
    # You'll need to set PINECONE_API_KEY environment variable to run this example
    access = PineconeAccessConfig(pinecone_api_key=os.getenv("PINECONE_API_KEY"))
    # When PINECONE_INDEX / PINECONE_ENVIRONMENT are unset, the placeholder
    # reminder text below is what gets passed through.
    index_name = os.environ.get(
        "PINECONE_INDEX",
        "your index name here. e.g. my-index,"
        "or define in environment variable PINECONE_INDEX",
    )
    environment = os.environ.get(
        "PINECONE_ENVIRONMENT",
        "your environment name here. e.g. us-east-1,"
        "or define in environment variable PINECONE_ENVIRONMENT",
    )
    destination_connection = PineconeConnectionConfig(
        access_config=access,
        index_name=index_name,
        environment=environment,
    )
    stager = PineconeUploadStagerConfig()
    uploader = PineconeUploaderConfig(batch_size=10, num_processes=2)

    pipeline = Pipeline.from_configs(
        context=processor,
        indexer_config=indexer,
        downloader_config=downloader,
        source_connection_config=source_connection,
        partitioner_config=partitioner,
        chunker_config=chunker,
        embedder_config=embedder,
        destination_connection_config=destination_connection,
        stager_config=stager,
        uploader_config=uploader,
    )
    pipeline.run()
|
examples/s3.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
4
|
+
from unstructured_ingest.logger import logger
|
|
5
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
6
|
+
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
7
|
+
from unstructured_ingest.processes.connectors.fsspec.s3 import (
|
|
8
|
+
CONNECTOR_TYPE,
|
|
9
|
+
S3ConnectionConfig,
|
|
10
|
+
S3DownloaderConfig,
|
|
11
|
+
S3IndexerConfig,
|
|
12
|
+
)
|
|
13
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
14
|
+
LocalUploaderConfig,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
17
|
+
from unstructured_ingest.processes.filter import FiltererConfig
|
|
18
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
19
|
+
|
|
20
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path.joinpath("example-docs")
work_dir = base_path.joinpath("tmp_ingest", CONNECTOR_TYPE)
output_path = work_dir.joinpath("output")
download_path = work_dir.joinpath("download")

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    # Every pipeline stage is collected here in the order it is handed to
    # Pipeline.from_configs: an anonymous S3 source, the processing steps, a
    # local-directory uploader and a file-size filter.
    pipeline_kwargs = {
        "context": ProcessorConfig(
            work_dir=str(work_dir.resolve()), verbose=True, iter_delete=True
        ),
        "indexer_config": S3IndexerConfig(
            remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/"
        ),
        "downloader_config": S3DownloaderConfig(download_dir=download_path),
        "source_connection_config": S3ConnectionConfig(anonymous=True),
        "partitioner_config": PartitionerConfig(strategy="fast"),
        "chunker_config": ChunkerConfig(chunking_strategy="by_title"),
        "embedder_config": EmbedderConfig(embedding_provider="huggingface"),
        "uploader_config": LocalUploaderConfig(output_dir=str(output_path.resolve())),
        "filterer_config": FiltererConfig(max_file_size=900000),
    }

    pipeline = Pipeline.from_configs(**pipeline_kwargs)
    pipeline.run()
|
examples/salesforce.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
5
|
+
from unstructured_ingest.logger import logger
|
|
6
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
7
|
+
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
8
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
9
|
+
LocalUploaderConfig,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.processes.connectors.salesforce import (
|
|
12
|
+
CONNECTOR_TYPE,
|
|
13
|
+
SalesforceAccessConfig,
|
|
14
|
+
SalesforceConnectionConfig,
|
|
15
|
+
SalesforceDownloaderConfig,
|
|
16
|
+
SalesforceIndexerConfig,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
19
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
20
|
+
|
|
21
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path / "example-docs"
work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
output_path = work_dir / "output"
download_path = work_dir / "download"

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")
    Pipeline.from_configs(
        context=ProcessorConfig(work_dir=str(work_dir.resolve())),
        # Only these two Salesforce record categories are indexed.
        indexer_config=SalesforceIndexerConfig(categories=["Campaign", "EmailMessage"]),
        downloader_config=SalesforceDownloaderConfig(download_dir=download_path),
        # Credentials come from the SALESFORCE_CONSUMER_KEY, SALESFORCE_PRIVATE_KEY
        # and SALESFORCE_USERNAME environment variables.
        source_connection_config=SalesforceConnectionConfig(
            # Passed by keyword, like every other connector example.
            # NOTE(review): the config classes are presumably pydantic models,
            # which reject positional constructor arguments - confirm.
            access_config=SalesforceAccessConfig(
                consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
                private_key=os.getenv("SALESFORCE_PRIVATE_KEY"),
            ),
            username=os.getenv("SALESFORCE_USERNAME"),
        ),
        partitioner_config=PartitionerConfig(strategy="fast"),
        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
        # Results are written to the local output directory under work_dir.
        uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
    ).run()
|
examples/sharepoint.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
5
|
+
from unstructured_ingest.logger import logger
|
|
6
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
7
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
8
|
+
LocalUploaderConfig,
|
|
9
|
+
)
|
|
10
|
+
from unstructured_ingest.processes.connectors.sharepoint import (
|
|
11
|
+
CONNECTOR_TYPE,
|
|
12
|
+
SharepointAccessConfig,
|
|
13
|
+
SharepointConnectionConfig,
|
|
14
|
+
SharepointDownloaderConfig,
|
|
15
|
+
SharepointIndexerConfig,
|
|
16
|
+
SharepointPermissionsConfig,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
19
|
+
|
|
20
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path.joinpath("example-docs")
work_dir = base_path.joinpath("tmp_ingest", CONNECTOR_TYPE)
output_path = work_dir.joinpath("output")
download_path = work_dir.joinpath("download")


if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    processor = ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True)

    # Source side: SharePoint, with every credential read from SHAREPOINT_*
    # environment variables.
    indexer = SharepointIndexerConfig()
    downloader = SharepointDownloaderConfig(download_dir=download_path)
    client_id = os.getenv("SHAREPOINT_CLIENT_ID")
    site = os.getenv("SHAREPOINT_SITE")
    access = SharepointAccessConfig(client_cred=os.getenv("SHAREPOINT_CRED"))
    permissions = SharepointPermissionsConfig(
        permissions_application_id=os.getenv("SHAREPOINT_PERMISSIONS_APP_ID"),
        permissions_client_cred=os.getenv("SHAREPOINT_PERMISSIONS_APP_CRED"),
        permissions_tenant=os.getenv("SHAREPOINT_PERMISSIONS_TENANT"),
    )
    source_connection = SharepointConnectionConfig(
        client_id=client_id,
        site=site,
        access_config=access,
        permissions_config=permissions,
    )

    # Only partitioning is configured; no chunker_config or embedder_config is
    # passed to the pipeline in this example.
    partitioner = PartitionerConfig(strategy="fast")

    # Destination side: the local output directory under work_dir.
    uploader = LocalUploaderConfig(output_dir=str(output_path.resolve()))

    pipeline = Pipeline.from_configs(
        context=processor,
        indexer_config=indexer,
        downloader_config=downloader,
        source_connection_config=source_connection,
        partitioner_config=partitioner,
        uploader_config=uploader,
    )
    pipeline.run()
|
examples/singlestore.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
4
|
+
from unstructured_ingest.logger import logger
|
|
5
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
6
|
+
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
7
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
8
|
+
LocalConnectionConfig,
|
|
9
|
+
LocalDownloaderConfig,
|
|
10
|
+
LocalIndexerConfig,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.processes.connectors.singlestore import (
|
|
13
|
+
CONNECTOR_TYPE,
|
|
14
|
+
SingleStoreAccessConfig,
|
|
15
|
+
SingleStoreConnectionConfig,
|
|
16
|
+
SingleStoreUploaderConfig,
|
|
17
|
+
SingleStoreUploadStagerConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
20
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
21
|
+
|
|
22
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path.joinpath("example-docs")
work_dir = base_path.joinpath("tmp_ingest", CONNECTOR_TYPE)
output_path = work_dir.joinpath("output")
download_path = work_dir.joinpath("download")

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    # Source side: a single local text file from example-docs.
    processor = ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True)
    indexer = LocalIndexerConfig(
        input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
    )
    downloader = LocalDownloaderConfig(download_dir=download_path)
    source_connection = LocalConnectionConfig()

    # Processing steps: partition, chunk by title, then embed.
    partitioner = PartitionerConfig(strategy="fast")
    chunker = ChunkerConfig(chunking_strategy="by_title")
    embedder = EmbedderConfig(embedding_provider="huggingface")

    # Destination side: SingleStore on localhost:3306, database "ingest_test",
    # rows written to the "elements" table.
    destination_connection = SingleStoreConnectionConfig(
        access_config=SingleStoreAccessConfig(password="password"),
        host="localhost",
        port=3306,
        database="ingest_test",
        user="root",
    )
    stager = SingleStoreUploadStagerConfig()
    uploader = SingleStoreUploaderConfig(table_name="elements")

    pipeline = Pipeline.from_configs(
        context=processor,
        indexer_config=indexer,
        downloader_config=downloader,
        source_connection_config=source_connection,
        partitioner_config=partitioner,
        chunker_config=chunker,
        embedder_config=embedder,
        destination_connection_config=destination_connection,
        stager_config=stager,
        uploader_config=uploader,
    )
    pipeline.run()
|
examples/sql.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sqlite3
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
6
|
+
from unstructured_ingest.logger import logger
|
|
7
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
8
|
+
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
9
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
10
|
+
LocalConnectionConfig,
|
|
11
|
+
LocalDownloaderConfig,
|
|
12
|
+
LocalIndexerConfig,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.processes.connectors.sql import (
|
|
15
|
+
CONNECTOR_TYPE,
|
|
16
|
+
POSTGRESQL_DB,
|
|
17
|
+
SQLITE_DB,
|
|
18
|
+
SQLAccessConfig,
|
|
19
|
+
SQLConnectionConfig,
|
|
20
|
+
SQLUploaderConfig,
|
|
21
|
+
SQLUploadStagerConfig,
|
|
22
|
+
)
|
|
23
|
+
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
24
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
25
|
+
|
|
26
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path / "example-docs"
work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
output_path = work_dir / "output"
download_path = work_dir / "download"

# File backing the SQLite destination. This is distinct from SQLITE_DB, which is
# the db_type constant handed to SQLConnectionConfig.
SQLITE_DB_PATH = "test-sql-db.sqlite"

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    # Stage configuration shared by both pipeline runs below; only the
    # destination connection differs between them.
    configs = {
        "context": ProcessorConfig(work_dir=str(work_dir.resolve())),
        "indexer_config": LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
        "downloader_config": LocalDownloaderConfig(download_dir=download_path),
        "source_connection_config": LocalConnectionConfig(),
        "partitioner_config": PartitionerConfig(strategy="fast"),
        "chunker_config": ChunkerConfig(
            chunking_strategy="by_title",
            chunk_include_orig_elements=False,
            chunk_max_characters=1500,
            chunk_multipage_sections=True,
        ),
        "embedder_config": EmbedderConfig(embedding_provider="huggingface"),
        "stager_config": SQLUploadStagerConfig(),
        "uploader_config": SQLUploaderConfig(batch_size=10),
    }

    # Start from a fresh database file. The path must be SQLITE_DB_PATH, the
    # same file the pipeline is pointed at below, not the SQLITE_DB type
    # constant; otherwise the schema ends up in a different file.
    if os.path.exists(SQLITE_DB_PATH):
        os.remove(SQLITE_DB_PATH)

    # Read the schema first so a missing script fails before a database file
    # is created.
    script_path = (
        Path(__file__).parent.parent.parent.parent.parent
        / Path("test_e2e/env_setup/sql/sqlite-schema.sql")
    ).resolve()
    schema_sql = script_path.read_text()

    # Create the schema, closing the connection even if the script fails.
    connection = sqlite3.connect(database=SQLITE_DB_PATH)
    try:
        connection.executescript(schema_sql)
    finally:
        connection.close()

    # sqlite test first
    Pipeline.from_configs(
        destination_connection_config=SQLConnectionConfig(
            db_type=SQLITE_DB,
            database=SQLITE_DB_PATH,
            access_config=SQLAccessConfig(),
        ),
        **configs,
    ).run()

    # now, pg with pgvector
    Pipeline.from_configs(
        destination_connection_config=SQLConnectionConfig(
            db_type=POSTGRESQL_DB,
            database="elements",
            host="localhost",
            port=5433,
            access_config=SQLAccessConfig(username="unstructured", password="test"),
        ),
        **configs,
    ).run()
|
examples/vectara.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
4
|
+
from unstructured_ingest.logger import logger
|
|
5
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
6
|
+
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
7
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
8
|
+
LocalConnectionConfig,
|
|
9
|
+
LocalDownloaderConfig,
|
|
10
|
+
LocalIndexerConfig,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.processes.connectors.vectara import (
|
|
13
|
+
CONNECTOR_TYPE,
|
|
14
|
+
VectaraAccessConfig,
|
|
15
|
+
VectaraConnectionConfig,
|
|
16
|
+
VectaraUploaderConfig,
|
|
17
|
+
VectaraUploadStagerConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
20
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
21
|
+
|
|
22
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path.joinpath("example-docs")
work_dir = base_path.joinpath("tmp_ingest", CONNECTOR_TYPE)
output_path = work_dir.joinpath("output")
download_path = work_dir.joinpath("download")

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    # Source side: local files under example-docs/multisimple/.
    processor = ProcessorConfig(work_dir=str(work_dir.resolve()))
    indexer = LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/")
    downloader = LocalDownloaderConfig(download_dir=download_path)
    source_connection = LocalConnectionConfig()

    # Processing steps: partition, chunk by title, then embed.
    partitioner = PartitionerConfig(strategy="fast")
    chunker = ChunkerConfig(
        chunking_strategy="by_title",
        chunk_include_orig_elements=False,
        chunk_max_characters=1500,
        chunk_multipage_sections=True,
    )
    embedder = EmbedderConfig(embedding_provider="huggingface")

    # Destination side: Vectara. Every "fill ..." value is a placeholder that
    # has to be replaced with real account details before running.
    access = VectaraAccessConfig(
        oauth_client_id="fill oauth_client_id", oauth_secret="fill oauth_secret"
    )
    destination_connection = VectaraConnectionConfig(
        access_config=access,
        customer_id="fill customer_id",
        corpus_name="fill corpus_name",
        corpus_key="fill corpus_key",
        token_url="fill token_url",
    )
    stager = VectaraUploadStagerConfig(batch_size=10)
    uploader = VectaraUploaderConfig()

    pipeline = Pipeline.from_configs(
        context=processor,
        indexer_config=indexer,
        downloader_config=downloader,
        source_connection_config=source_connection,
        partitioner_config=partitioner,
        chunker_config=chunker,
        embedder_config=embedder,
        destination_connection_config=destination_connection,
        stager_config=stager,
        uploader_config=uploader,
    )
    pipeline.run()
|
examples/weaviate.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.interfaces import ProcessorConfig
|
|
4
|
+
from unstructured_ingest.logger import logger
|
|
5
|
+
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
6
|
+
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
7
|
+
from unstructured_ingest.processes.connectors.local import (
|
|
8
|
+
LocalConnectionConfig,
|
|
9
|
+
LocalDownloaderConfig,
|
|
10
|
+
LocalIndexerConfig,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.processes.connectors.weaviate.local import (
|
|
13
|
+
CONNECTOR_TYPE,
|
|
14
|
+
LocalWeaviateConnectionConfig,
|
|
15
|
+
LocalWeaviateUploaderConfig,
|
|
16
|
+
LocalWeaviateUploadStagerConfig,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
19
|
+
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
20
|
+
|
|
21
|
+
# Directory layout for this example: everything is derived from the directory
# four levels above this file.
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path.joinpath("example-docs")
work_dir = base_path.joinpath("tmp_ingest", CONNECTOR_TYPE)
output_path = work_dir.joinpath("output")
download_path = work_dir.joinpath("download")

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    # Every pipeline stage is collected here in the order it is handed to
    # Pipeline.from_configs: a local source, the processing steps, then the
    # local Weaviate destination.
    pipeline_kwargs = {
        "context": ProcessorConfig(work_dir=str(work_dir.resolve())),
        "indexer_config": LocalIndexerConfig(
            input_path=str(docs_path.resolve()) + "/multisimple/"
        ),
        "downloader_config": LocalDownloaderConfig(download_dir=download_path),
        "source_connection_config": LocalConnectionConfig(),
        "partitioner_config": PartitionerConfig(strategy="fast"),
        "chunker_config": ChunkerConfig(chunking_strategy="by_title"),
        "embedder_config": EmbedderConfig(embedding_provider="huggingface"),
        "destination_connection_config": LocalWeaviateConnectionConfig(
            # Connects to http://localhost:8080
        ),
        "stager_config": LocalWeaviateUploadStagerConfig(),
        "uploader_config": LocalWeaviateUploaderConfig(
            collection="elements", batch_size=10, dynamic_batch=False
        ),
    }

    pipeline = Pipeline.from_configs(**pipeline_kwargs)
    pipeline.run()
|
|
@@ -4,7 +4,7 @@ from pathlib import Path
|
|
|
4
4
|
import pytest
|
|
5
5
|
|
|
6
6
|
from test.integration.utils import requires_env
|
|
7
|
-
from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
|
|
7
|
+
from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
|
|
8
8
|
|
|
9
9
|
int_test_dir = Path(__file__).parent
|
|
10
10
|
assets_dir = int_test_dir / "assets"
|
|
@@ -20,8 +20,9 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
20
20
|
source_connector_validation,
|
|
21
21
|
)
|
|
22
22
|
from test.integration.utils import requires_env
|
|
23
|
-
from unstructured_ingest.
|
|
24
|
-
from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
|
|
23
|
+
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
24
|
+
from unstructured_ingest.errors_v2 import UserAuthError, UserError
|
|
25
|
+
from unstructured_ingest.processes.connectors.databricks.volumes_native import (
|
|
25
26
|
CONNECTOR_TYPE,
|
|
26
27
|
DatabricksNativeVolumesAccessConfig,
|
|
27
28
|
DatabricksNativeVolumesConnectionConfig,
|
|
@@ -32,7 +33,6 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes_native impor
|
|
|
32
33
|
DatabricksNativeVolumesUploader,
|
|
33
34
|
DatabricksNativeVolumesUploaderConfig,
|
|
34
35
|
)
|
|
35
|
-
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
@dataclass
|
|
@@ -13,7 +13,7 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
13
13
|
)
|
|
14
14
|
from test.integration.utils import requires_env
|
|
15
15
|
from unstructured_ingest.error import SourceConnectionError
|
|
16
|
-
from unstructured_ingest.v2.processes.connectors.discord import (
|
|
16
|
+
from unstructured_ingest.processes.connectors.discord import (
|
|
17
17
|
CONNECTOR_TYPE,
|
|
18
18
|
DiscordAccessConfig,
|
|
19
19
|
DiscordConnectionConfig,
|
|
@@ -10,14 +10,14 @@ from test.integration.connectors.utils.validation.destination import (
|
|
|
10
10
|
StagerValidationConfigs,
|
|
11
11
|
stager_validation,
|
|
12
12
|
)
|
|
13
|
-
from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
|
|
13
|
+
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
14
|
+
from unstructured_ingest.processes.connectors.duckdb.duckdb import (
|
|
14
15
|
CONNECTOR_TYPE,
|
|
15
16
|
DuckDBConnectionConfig,
|
|
16
17
|
DuckDBUploader,
|
|
17
18
|
DuckDBUploaderConfig,
|
|
18
19
|
DuckDBUploadStager,
|
|
19
20
|
)
|
|
20
|
-
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
@pytest.fixture
|
|
@@ -9,7 +9,8 @@ import pytest
|
|
|
9
9
|
|
|
10
10
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG
|
|
11
11
|
from test.integration.utils import requires_env
|
|
12
|
-
from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
|
|
12
|
+
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
13
|
+
from unstructured_ingest.processes.connectors.duckdb.motherduck import (
|
|
13
14
|
CONNECTOR_TYPE,
|
|
14
15
|
MotherDuckAccessConfig,
|
|
15
16
|
MotherDuckConnectionConfig,
|
|
@@ -17,7 +18,6 @@ from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
|
|
|
17
18
|
MotherDuckUploaderConfig,
|
|
18
19
|
MotherDuckUploadStager,
|
|
19
20
|
)
|
|
20
|
-
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
@pytest.fixture
|
|
@@ -22,8 +22,8 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
22
22
|
source_connector_validation,
|
|
23
23
|
)
|
|
24
24
|
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
|
|
25
|
-
from unstructured_ingest.
|
|
26
|
-
from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
|
|
25
|
+
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
26
|
+
from unstructured_ingest.processes.connectors.elasticsearch.elasticsearch import (
|
|
27
27
|
CONNECTOR_TYPE,
|
|
28
28
|
ElasticsearchAccessConfig,
|
|
29
29
|
ElasticsearchConnectionConfig,
|
|
@@ -20,11 +20,12 @@ from test.integration.connectors.utils.validation.source import (
|
|
|
20
20
|
SourceValidationConfigs,
|
|
21
21
|
source_connector_validation,
|
|
22
22
|
)
|
|
23
|
+
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
23
24
|
from unstructured_ingest.error import (
|
|
24
25
|
DestinationConnectionError,
|
|
25
26
|
SourceConnectionError,
|
|
26
27
|
)
|
|
27
|
-
from unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch import (
|
|
28
|
+
from unstructured_ingest.processes.connectors.elasticsearch.opensearch import (
|
|
28
29
|
CONNECTOR_TYPE,
|
|
29
30
|
OpenSearchAccessConfig,
|
|
30
31
|
OpenSearchConnectionConfig,
|
|
@@ -37,7 +38,6 @@ from unstructured_ingest.v2.processes.connectors.elasticsearch.opensearch import
|
|
|
37
38
|
OpenSearchUploadStager,
|
|
38
39
|
OpenSearchUploadStagerConfig,
|
|
39
40
|
)
|
|
40
|
-
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
41
41
|
|
|
42
42
|
SOURCE_INDEX_NAME = "movies"
|
|
43
43
|
DESTINATION_INDEX_NAME = "elements"
|