unstructured-ingest 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- examples/airtable.py +44 -0
- examples/azure_cognitive_search.py +55 -0
- examples/chroma.py +54 -0
- examples/couchbase.py +55 -0
- examples/databricks_volumes_dest.py +55 -0
- examples/databricks_volumes_source.py +53 -0
- examples/delta_table.py +45 -0
- examples/discord_example.py +36 -0
- examples/elasticsearch.py +49 -0
- examples/google_drive.py +45 -0
- examples/kdbai.py +54 -0
- examples/local.py +36 -0
- examples/milvus.py +44 -0
- examples/mongodb.py +53 -0
- examples/opensearch.py +50 -0
- examples/pinecone.py +57 -0
- examples/s3.py +38 -0
- examples/salesforce.py +44 -0
- examples/sharepoint.py +47 -0
- examples/singlestore.py +49 -0
- examples/sql.py +90 -0
- examples/vectara.py +54 -0
- examples/weaviate.py +44 -0
- test/integration/chunkers/test_chunkers.py +1 -1
- test/integration/connectors/conftest.py +1 -1
- test/integration/connectors/databricks/test_volumes_native.py +3 -3
- test/integration/connectors/discord/test_discord.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +2 -2
- test/integration/connectors/duckdb/test_motherduck.py +2 -2
- test/integration/connectors/elasticsearch/test_elasticsearch.py +2 -2
- test/integration/connectors/elasticsearch/test_opensearch.py +2 -2
- test/integration/connectors/sql/test_databricks_delta_tables.py +3 -3
- test/integration/connectors/sql/test_postgres.py +2 -2
- test/integration/connectors/sql/test_singlestore.py +2 -2
- test/integration/connectors/sql/test_snowflake.py +2 -2
- test/integration/connectors/sql/test_sqlite.py +2 -2
- test/integration/connectors/sql/test_vastdb.py +1 -1
- test/integration/connectors/test_astradb.py +2 -2
- test/integration/connectors/test_azure_ai_search.py +2 -2
- test/integration/connectors/test_chroma.py +2 -2
- test/integration/connectors/test_confluence.py +1 -1
- test/integration/connectors/test_delta_table.py +2 -2
- test/integration/connectors/test_dropbox.py +2 -2
- test/integration/connectors/test_github.py +49 -0
- test/integration/connectors/test_google_drive.py +2 -2
- test/integration/connectors/test_jira.py +1 -1
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +2 -2
- test/integration/connectors/test_mongodb.py +2 -2
- test/integration/connectors/test_neo4j.py +7 -7
- test/integration/connectors/test_notion.py +2 -2
- test/integration/connectors/test_onedrive.py +2 -2
- test/integration/connectors/test_pinecone.py +3 -3
- test/integration/connectors/test_qdrant.py +6 -6
- test/integration/connectors/test_redis.py +3 -3
- test/integration/connectors/test_s3.py +3 -3
- test/integration/connectors/test_sharepoint.py +1 -1
- test/integration/connectors/test_vectara.py +4 -4
- test/integration/connectors/test_zendesk.py +2 -2
- test/integration/connectors/utils/validation/destination.py +2 -2
- test/integration/connectors/utils/validation/source.py +2 -2
- test/integration/connectors/weaviate/test_cloud.py +1 -1
- test/integration/connectors/weaviate/test_local.py +2 -2
- test/integration/embedders/test_azure_openai.py +1 -1
- test/integration/embedders/test_bedrock.py +2 -2
- test/integration/embedders/test_huggingface.py +1 -1
- test/integration/embedders/test_mixedbread.py +1 -1
- test/integration/embedders/test_octoai.py +2 -2
- test/integration/embedders/test_openai.py +2 -2
- test/integration/embedders/test_togetherai.py +2 -2
- test/integration/embedders/test_vertexai.py +1 -1
- test/integration/embedders/test_voyageai.py +1 -1
- test/integration/partitioners/test_partitioner.py +2 -2
- test/unit/{v2/chunkers → chunkers}/test_chunkers.py +1 -1
- test/unit/{v2/connectors → connectors}/ibm_watsonx/test_ibm_watsonx_s3.py +6 -6
- test/unit/{v2/connectors → connectors}/motherduck/test_base.py +5 -5
- test/unit/{v2/connectors → connectors}/sql/test_sql.py +4 -4
- test/unit/{v2/connectors → connectors}/test_confluence.py +1 -1
- test/unit/{v2/connectors → connectors}/test_jira.py +1 -1
- test/unit/{v2/embedders → embedders}/test_huggingface.py +1 -1
- test/unit/{v2/embedders → embedders}/test_vertexai.py +1 -1
- test/unit/{v2/partitioners → partitioners}/test_partitioner.py +2 -2
- test/unit/test_html.py +1 -1
- test/unit/{v2/test_interfaces.py → test_interfaces.py} +1 -1
- test/unit/test_utils.py +106 -97
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/__init__.py +0 -14
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +259 -9
- unstructured_ingest/cli/base/dest.py +58 -61
- unstructured_ingest/cli/base/src.py +54 -36
- unstructured_ingest/cli/cli.py +4 -17
- unstructured_ingest/{v2/cli → cli}/cmds.py +2 -2
- unstructured_ingest/{v2/cli → cli}/utils/model_conversion.py +6 -6
- unstructured_ingest/{v2/types → data_types}/file_data.py +1 -1
- unstructured_ingest/embed/bedrock.py +3 -3
- unstructured_ingest/embed/octoai.py +3 -3
- unstructured_ingest/embed/openai.py +3 -3
- unstructured_ingest/embed/togetherai.py +4 -4
- unstructured_ingest/embed/vertexai.py +1 -1
- unstructured_ingest/embed/voyageai.py +4 -4
- unstructured_ingest/{v2/interfaces → interfaces}/downloader.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/indexer.py +3 -3
- unstructured_ingest/{v2/interfaces → interfaces}/upload_stager.py +2 -2
- unstructured_ingest/{v2/interfaces → interfaces}/uploader.py +2 -2
- unstructured_ingest/{v2/otel.py → otel.py} +1 -1
- unstructured_ingest/pipeline/__init__.py +0 -22
- unstructured_ingest/pipeline/interfaces.py +179 -238
- unstructured_ingest/{v2/pipeline → pipeline}/otel.py +2 -2
- unstructured_ingest/pipeline/pipeline.py +388 -97
- unstructured_ingest/{v2/pipeline → pipeline}/steps/chunk.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/download.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/embed.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/filter.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/index.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/partition.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/stage.py +5 -5
- unstructured_ingest/{v2/pipeline → pipeline}/steps/uncompress.py +4 -4
- unstructured_ingest/{v2/pipeline → pipeline}/steps/upload.py +5 -5
- unstructured_ingest/{v2/processes → processes}/chunker.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connector_registry.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/__init__.py +14 -11
- unstructured_ingest/{v2/processes → processes}/connectors/airtable.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/astradb.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/azure_ai_search.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/chroma.py +10 -7
- unstructured_ingest/{v2/processes → processes}/connectors/confluence.py +11 -11
- unstructured_ingest/{v2/processes → processes}/connectors/couchbase.py +12 -12
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes.py +12 -11
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_native.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/databricks/volumes_table.py +8 -9
- unstructured_ingest/{v2/processes → processes}/connectors/delta_table.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/discord.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/base.py +3 -4
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/duckdb.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/duckdb/motherduck.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/elasticsearch.py +17 -17
- unstructured_ingest/{v2/processes → processes}/connectors/elasticsearch/opensearch.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/box.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/dropbox.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/fsspec.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/gcs.py +8 -8
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/processes/connectors/github.py +221 -0
- unstructured_ingest/{v2/processes → processes}/connectors/gitlab.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/google_drive.py +22 -13
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/jira.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/kafka.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/kafka/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/kdbai.py +11 -7
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/aws.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/azure.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/gcp.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/lancedb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/lancedb/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/local.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/milvus.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/mongodb.py +13 -13
- unstructured_ingest/{v2/processes → processes}/connectors/neo4j.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/notion/client.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/connector.py +15 -15
- unstructured_ingest/{v2/processes → processes}/connectors/notion/helpers.py +4 -4
- unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_wrapper.py +5 -1
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/{connector → processes/connectors}/notion/types/block.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/bookmark.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/breadcrumb.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/bulleted_list_item.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/callout.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/child_database.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/child_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/code.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/column_list.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/divider.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/embed.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/equation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/file.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/heading.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/image.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/link_preview.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/link_to_page.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/numbered_list.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/paragraph.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/pdf.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/quote.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/synced_block.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table.py +2 -5
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/table_of_contents.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/template.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/todo.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/toggle.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/blocks/unsupported.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/video.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/checkbox.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_by.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/created_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/date.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/email.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/files.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/formula.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/last_edited_by.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/last_edited_time.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/multiselect.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/people.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/phone_number.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/relation.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rich_text.py +2 -2
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/rollup.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/select.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/status.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/title.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/database_properties/unique_id.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/url.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/database_properties/verification.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/date.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/file.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/page.py +4 -4
- unstructured_ingest/{connector → processes/connectors}/notion/types/parent.py +1 -1
- unstructured_ingest/{connector → processes/connectors}/notion/types/rich_text.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/notion/types/user.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/onedrive.py +10 -10
- unstructured_ingest/{v2/processes → processes}/connectors/outlook.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/pinecone.py +12 -9
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/cloud.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/local.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/qdrant.py +9 -6
- unstructured_ingest/{v2/processes → processes}/connectors/qdrant/server.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/redisdb.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/salesforce.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sharepoint.py +7 -7
- unstructured_ingest/{v2/processes → processes}/connectors/slack.py +9 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/sql/databricks_delta_tables.py +11 -9
- unstructured_ingest/{v2/processes → processes}/connectors/sql/postgres.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/singlestore.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/snowflake.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sql.py +16 -11
- unstructured_ingest/{v2/processes → processes}/connectors/sql/sqlite.py +4 -4
- unstructured_ingest/{v2/processes → processes}/connectors/sql/vastdb.py +9 -10
- unstructured_ingest/{v2/processes → processes}/connectors/vectara.py +6 -6
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/__init__.py +1 -1
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/cloud.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/embedded.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/local.py +3 -3
- unstructured_ingest/{v2/processes → processes}/connectors/weaviate/weaviate.py +5 -5
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/client.py +2 -2
- unstructured_ingest/{v2/processes → processes}/connectors/zendesk/zendesk.py +10 -10
- unstructured_ingest/{v2/processes → processes}/embedder.py +1 -1
- unstructured_ingest/{v2/processes → processes}/filter.py +4 -4
- unstructured_ingest/{v2/processes → processes}/partitioner.py +6 -6
- unstructured_ingest/{v2/processes → processes}/uncompress.py +3 -3
- unstructured_ingest/{v2/processes → processes}/utils/blob_storage.py +2 -2
- unstructured_ingest/{v2/unstructured_api.py → unstructured_api.py} +2 -2
- unstructured_ingest/utils/compression.py +1 -48
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/utils/html.py +3 -3
- unstructured_ingest/{v2/utils.py → utils/pydantic_models.py} +0 -9
- unstructured_ingest/utils/string_and_date_utils.py +1 -1
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/METADATA +99 -99
- unstructured_ingest-0.7.0.dist-info/RECORD +370 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/top_level.txt +1 -0
- test/unit/v2/test_utils.py +0 -82
- unstructured_ingest/cli/cmd_factory.py +0 -12
- unstructured_ingest/cli/cmds/__init__.py +0 -145
- unstructured_ingest/cli/cmds/airtable.py +0 -69
- unstructured_ingest/cli/cmds/astradb.py +0 -99
- unstructured_ingest/cli/cmds/azure_ai_search.py +0 -65
- unstructured_ingest/cli/cmds/biomed.py +0 -52
- unstructured_ingest/cli/cmds/chroma.py +0 -104
- unstructured_ingest/cli/cmds/clarifai.py +0 -71
- unstructured_ingest/cli/cmds/confluence.py +0 -69
- unstructured_ingest/cli/cmds/databricks_volumes.py +0 -163
- unstructured_ingest/cli/cmds/delta_table.py +0 -94
- unstructured_ingest/cli/cmds/discord.py +0 -47
- unstructured_ingest/cli/cmds/elasticsearch.py +0 -133
- unstructured_ingest/cli/cmds/fsspec/azure.py +0 -94
- unstructured_ingest/cli/cmds/fsspec/box.py +0 -48
- unstructured_ingest/cli/cmds/fsspec/dropbox.py +0 -51
- unstructured_ingest/cli/cmds/fsspec/fsspec.py +0 -15
- unstructured_ingest/cli/cmds/fsspec/gcs.py +0 -71
- unstructured_ingest/cli/cmds/fsspec/s3.py +0 -74
- unstructured_ingest/cli/cmds/fsspec/sftp.py +0 -58
- unstructured_ingest/cli/cmds/github.py +0 -54
- unstructured_ingest/cli/cmds/gitlab.py +0 -54
- unstructured_ingest/cli/cmds/google_drive.py +0 -49
- unstructured_ingest/cli/cmds/hubspot.py +0 -70
- unstructured_ingest/cli/cmds/jira.py +0 -71
- unstructured_ingest/cli/cmds/kafka.py +0 -102
- unstructured_ingest/cli/cmds/local.py +0 -43
- unstructured_ingest/cli/cmds/mongodb.py +0 -72
- unstructured_ingest/cli/cmds/notion.py +0 -48
- unstructured_ingest/cli/cmds/onedrive.py +0 -66
- unstructured_ingest/cli/cmds/opensearch.py +0 -117
- unstructured_ingest/cli/cmds/outlook.py +0 -67
- unstructured_ingest/cli/cmds/pinecone.py +0 -71
- unstructured_ingest/cli/cmds/qdrant.py +0 -124
- unstructured_ingest/cli/cmds/reddit.py +0 -67
- unstructured_ingest/cli/cmds/salesforce.py +0 -58
- unstructured_ingest/cli/cmds/sharepoint.py +0 -66
- unstructured_ingest/cli/cmds/slack.py +0 -56
- unstructured_ingest/cli/cmds/sql.py +0 -66
- unstructured_ingest/cli/cmds/vectara.py +0 -66
- unstructured_ingest/cli/cmds/weaviate.py +0 -98
- unstructured_ingest/cli/cmds/wikipedia.py +0 -40
- unstructured_ingest/cli/common.py +0 -7
- unstructured_ingest/cli/interfaces.py +0 -663
- unstructured_ingest/cli/utils.py +0 -205
- unstructured_ingest/connector/airtable.py +0 -309
- unstructured_ingest/connector/astradb.py +0 -267
- unstructured_ingest/connector/azure_ai_search.py +0 -144
- unstructured_ingest/connector/biomed.py +0 -320
- unstructured_ingest/connector/chroma.py +0 -158
- unstructured_ingest/connector/clarifai.py +0 -122
- unstructured_ingest/connector/confluence.py +0 -285
- unstructured_ingest/connector/databricks_volumes.py +0 -137
- unstructured_ingest/connector/delta_table.py +0 -203
- unstructured_ingest/connector/discord.py +0 -180
- unstructured_ingest/connector/elasticsearch.py +0 -396
- unstructured_ingest/connector/fsspec/azure.py +0 -78
- unstructured_ingest/connector/fsspec/box.py +0 -109
- unstructured_ingest/connector/fsspec/dropbox.py +0 -160
- unstructured_ingest/connector/fsspec/fsspec.py +0 -359
- unstructured_ingest/connector/fsspec/gcs.py +0 -82
- unstructured_ingest/connector/fsspec/s3.py +0 -62
- unstructured_ingest/connector/fsspec/sftp.py +0 -81
- unstructured_ingest/connector/git.py +0 -124
- unstructured_ingest/connector/github.py +0 -174
- unstructured_ingest/connector/gitlab.py +0 -142
- unstructured_ingest/connector/google_drive.py +0 -348
- unstructured_ingest/connector/hubspot.py +0 -278
- unstructured_ingest/connector/jira.py +0 -469
- unstructured_ingest/connector/kafka.py +0 -293
- unstructured_ingest/connector/local.py +0 -139
- unstructured_ingest/connector/mongodb.py +0 -284
- unstructured_ingest/connector/notion/client.py +0 -248
- unstructured_ingest/connector/notion/connector.py +0 -469
- unstructured_ingest/connector/notion/helpers.py +0 -584
- unstructured_ingest/connector/notion/types/blocks/bookmark.py +0 -40
- unstructured_ingest/connector/notion/types/blocks/callout.py +0 -94
- unstructured_ingest/connector/notion/types/blocks/child_database.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/code.py +0 -43
- unstructured_ingest/connector/notion/types/blocks/column_list.py +0 -35
- unstructured_ingest/connector/notion/types/blocks/divider.py +0 -22
- unstructured_ingest/connector/notion/types/blocks/heading.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/link_preview.py +0 -24
- unstructured_ingest/connector/notion/types/blocks/numbered_list.py +0 -29
- unstructured_ingest/connector/notion/types/blocks/quote.py +0 -37
- unstructured_ingest/connector/notion/types/blocks/synced_block.py +0 -57
- unstructured_ingest/connector/notion/types/blocks/table.py +0 -63
- unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +0 -23
- unstructured_ingest/connector/notion/types/blocks/template.py +0 -30
- unstructured_ingest/connector/notion/types/blocks/todo.py +0 -42
- unstructured_ingest/connector/notion/types/blocks/unsupported.py +0 -20
- unstructured_ingest/connector/notion/types/database_properties/__init__.py +0 -106
- unstructured_ingest/connector/notion/types/database_properties/checkbox.py +0 -38
- unstructured_ingest/connector/notion/types/database_properties/date.py +0 -41
- unstructured_ingest/connector/notion/types/database_properties/formula.py +0 -49
- unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +0 -34
- unstructured_ingest/connector/notion/types/database_properties/multiselect.py +0 -73
- unstructured_ingest/connector/notion/types/database_properties/people.py +0 -40
- unstructured_ingest/connector/notion/types/database_properties/phone_number.py +0 -36
- unstructured_ingest/connector/notion/types/database_properties/relation.py +0 -67
- unstructured_ingest/connector/notion/types/database_properties/select.py +0 -68
- unstructured_ingest/connector/notion/types/database_properties/status.py +0 -80
- unstructured_ingest/connector/notion/types/database_properties/unique_id.py +0 -50
- unstructured_ingest/connector/notion/types/date.py +0 -26
- unstructured_ingest/connector/notion/types/file.py +0 -51
- unstructured_ingest/connector/notion/types/user.py +0 -76
- unstructured_ingest/connector/onedrive.py +0 -232
- unstructured_ingest/connector/opensearch.py +0 -218
- unstructured_ingest/connector/outlook.py +0 -285
- unstructured_ingest/connector/pinecone.py +0 -150
- unstructured_ingest/connector/qdrant.py +0 -144
- unstructured_ingest/connector/reddit.py +0 -166
- unstructured_ingest/connector/registry.py +0 -109
- unstructured_ingest/connector/salesforce.py +0 -301
- unstructured_ingest/connector/sharepoint.py +0 -573
- unstructured_ingest/connector/slack.py +0 -224
- unstructured_ingest/connector/sql.py +0 -199
- unstructured_ingest/connector/vectara.py +0 -253
- unstructured_ingest/connector/weaviate.py +0 -190
- unstructured_ingest/connector/wikipedia.py +0 -208
- unstructured_ingest/enhanced_dataclass/__init__.py +0 -4
- unstructured_ingest/enhanced_dataclass/core.py +0 -99
- unstructured_ingest/enhanced_dataclass/dataclasses.py +0 -54
- unstructured_ingest/enhanced_dataclass/json_mixin.py +0 -125
- unstructured_ingest/interfaces.py +0 -852
- unstructured_ingest/pipeline/copy.py +0 -19
- unstructured_ingest/pipeline/doc_factory.py +0 -12
- unstructured_ingest/pipeline/partition.py +0 -60
- unstructured_ingest/pipeline/permissions.py +0 -12
- unstructured_ingest/pipeline/reformat/chunking.py +0 -134
- unstructured_ingest/pipeline/reformat/embedding.py +0 -64
- unstructured_ingest/pipeline/source.py +0 -77
- unstructured_ingest/pipeline/utils.py +0 -6
- unstructured_ingest/pipeline/write.py +0 -18
- unstructured_ingest/processor.py +0 -93
- unstructured_ingest/runner/__init__.py +0 -104
- unstructured_ingest/runner/airtable.py +0 -35
- unstructured_ingest/runner/astradb.py +0 -34
- unstructured_ingest/runner/base_runner.py +0 -89
- unstructured_ingest/runner/biomed.py +0 -45
- unstructured_ingest/runner/confluence.py +0 -35
- unstructured_ingest/runner/delta_table.py +0 -34
- unstructured_ingest/runner/discord.py +0 -35
- unstructured_ingest/runner/elasticsearch.py +0 -40
- unstructured_ingest/runner/fsspec/azure.py +0 -30
- unstructured_ingest/runner/fsspec/box.py +0 -28
- unstructured_ingest/runner/fsspec/dropbox.py +0 -30
- unstructured_ingest/runner/fsspec/fsspec.py +0 -40
- unstructured_ingest/runner/fsspec/gcs.py +0 -28
- unstructured_ingest/runner/fsspec/s3.py +0 -28
- unstructured_ingest/runner/fsspec/sftp.py +0 -28
- unstructured_ingest/runner/github.py +0 -37
- unstructured_ingest/runner/gitlab.py +0 -37
- unstructured_ingest/runner/google_drive.py +0 -35
- unstructured_ingest/runner/hubspot.py +0 -35
- unstructured_ingest/runner/jira.py +0 -35
- unstructured_ingest/runner/kafka.py +0 -34
- unstructured_ingest/runner/local.py +0 -23
- unstructured_ingest/runner/mongodb.py +0 -34
- unstructured_ingest/runner/notion.py +0 -61
- unstructured_ingest/runner/onedrive.py +0 -35
- unstructured_ingest/runner/opensearch.py +0 -40
- unstructured_ingest/runner/outlook.py +0 -33
- unstructured_ingest/runner/reddit.py +0 -35
- unstructured_ingest/runner/salesforce.py +0 -33
- unstructured_ingest/runner/sharepoint.py +0 -35
- unstructured_ingest/runner/slack.py +0 -33
- unstructured_ingest/runner/utils.py +0 -47
- unstructured_ingest/runner/wikipedia.py +0 -35
- unstructured_ingest/runner/writers/__init__.py +0 -48
- unstructured_ingest/runner/writers/astradb.py +0 -22
- unstructured_ingest/runner/writers/azure_ai_search.py +0 -24
- unstructured_ingest/runner/writers/base_writer.py +0 -26
- unstructured_ingest/runner/writers/chroma.py +0 -22
- unstructured_ingest/runner/writers/clarifai.py +0 -19
- unstructured_ingest/runner/writers/databricks_volumes.py +0 -25
- unstructured_ingest/runner/writers/delta_table.py +0 -24
- unstructured_ingest/runner/writers/elasticsearch.py +0 -24
- unstructured_ingest/runner/writers/fsspec/azure.py +0 -24
- unstructured_ingest/runner/writers/fsspec/box.py +0 -21
- unstructured_ingest/runner/writers/fsspec/dropbox.py +0 -21
- unstructured_ingest/runner/writers/fsspec/gcs.py +0 -19
- unstructured_ingest/runner/writers/fsspec/s3.py +0 -21
- unstructured_ingest/runner/writers/kafka.py +0 -21
- unstructured_ingest/runner/writers/mongodb.py +0 -21
- unstructured_ingest/runner/writers/opensearch.py +0 -26
- unstructured_ingest/runner/writers/pinecone.py +0 -21
- unstructured_ingest/runner/writers/qdrant.py +0 -19
- unstructured_ingest/runner/writers/sql.py +0 -22
- unstructured_ingest/runner/writers/vectara.py +0 -22
- unstructured_ingest/runner/writers/weaviate.py +0 -21
- unstructured_ingest/utils/google_filetype.py +0 -9
- unstructured_ingest/v2/__init__.py +0 -1
- unstructured_ingest/v2/cli/__init__.py +0 -0
- unstructured_ingest/v2/cli/base/__init__.py +0 -4
- unstructured_ingest/v2/cli/base/cmd.py +0 -269
- unstructured_ingest/v2/cli/base/dest.py +0 -85
- unstructured_ingest/v2/cli/base/src.py +0 -85
- unstructured_ingest/v2/cli/cli.py +0 -24
- unstructured_ingest/v2/cli/utils/__init__.py +0 -0
- unstructured_ingest/v2/logger.py +0 -126
- unstructured_ingest/v2/main.py +0 -11
- unstructured_ingest/v2/pipeline/__init__.py +0 -0
- unstructured_ingest/v2/pipeline/interfaces.py +0 -211
- unstructured_ingest/v2/pipeline/pipeline.py +0 -408
- unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +0 -10
- unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json +0 -23
- unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/interfaces.py +0 -32
- unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/v2/processes/connectors/notion/types/block.py +0 -96
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +0 -63
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +0 -23
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +0 -21
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +0 -29
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +0 -31
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +0 -22
- unstructured_ingest/v2/processes/connectors/notion/types/database.py +0 -73
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +0 -35
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +0 -36
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -34
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +0 -49
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +0 -43
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +0 -56
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +0 -37
- unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +0 -78
- unstructured_ingest/v2/processes/connectors/notion/types/page.py +0 -45
- unstructured_ingest/v2/processes/connectors/notion/types/parent.py +0 -66
- unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +0 -189
- unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/v2/processes/utils/__init__.py +0 -0
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest-0.6.2.dist-info/RECORD +0 -589
- {test/unit/v2 → examples}/__init__.py +0 -0
- /test/unit/{v2/chunkers → chunkers}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/ibm_watsonx/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/motherduck/__init__.py +0 -0
- /test/unit/{v2/connectors → connectors}/sql/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/__init__.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_bedrock.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_mixedbread.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_octoai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_openai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_togetherai.py +0 -0
- /test/unit/{v2/embedders → embedders}/test_voyageai.py +0 -0
- /test/unit/{v2/partitioners → partitioners}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/__init__.py +0 -0
- /test/unit/{v2/utils → utils}/data_generator.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/base/importer.py +0 -0
- /unstructured_ingest/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- /unstructured_ingest/{v2/cli → cli}/utils/click.py +0 -0
- /unstructured_ingest/{connector → data_types}/__init__.py +0 -0
- /unstructured_ingest/{v2/errors.py → errors_v2.py} +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/__init__.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/connector.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/process.py +0 -0
- /unstructured_ingest/{v2/interfaces → interfaces}/processor.py +0 -0
- /unstructured_ingest/{connector/fsspec → pipeline/steps}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/__init__.py +0 -0
- /unstructured_ingest/{connector/notion → processes/connectors/assets}/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/fsspec/utils.py +0 -0
- /unstructured_ingest/{connector/notion/types → processes/connectors/notion}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/__init__.py +0 -0
- /unstructured_ingest/{ingest_backoff → processes/connectors/notion/ingest_backoff}/_common.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/interfaces.py +0 -0
- /unstructured_ingest/{pipeline/reformat → processes/connectors/notion/types}/__init__.py +0 -0
- /unstructured_ingest/{connector → processes/connectors}/notion/types/blocks/__init__.py +0 -0
- /unstructured_ingest/{v2/processes → processes}/connectors/utils.py +0 -0
- /unstructured_ingest/{runner/fsspec → processes/connectors/zendesk}/__init__.py +0 -0
- /unstructured_ingest/{runner/writers/fsspec → processes/utils}/__init__.py +0 -0
- /unstructured_ingest/{v2 → utils}/constants.py +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.2.dist-info → unstructured_ingest-0.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,166 +0,0 @@
|
|
|
1
|
-
import typing as t
|
|
2
|
-
from dataclasses import dataclass, field
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
7
|
-
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
8
|
-
from unstructured_ingest.interfaces import (
|
|
9
|
-
AccessConfig,
|
|
10
|
-
BaseConnectorConfig,
|
|
11
|
-
BaseSingleIngestDoc,
|
|
12
|
-
BaseSourceConnector,
|
|
13
|
-
IngestDocCleanupMixin,
|
|
14
|
-
SourceConnectorCleanupMixin,
|
|
15
|
-
SourceMetadata,
|
|
16
|
-
)
|
|
17
|
-
from unstructured_ingest.logger import logger
|
|
18
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
19
|
-
|
|
20
|
-
if t.TYPE_CHECKING:
|
|
21
|
-
from praw import Reddit
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@dataclass
class RedditAccessConfig(AccessConfig):
    """Credentials for the Reddit API that must be treated as sensitive."""

    # Passed to praw's ``Reddit(client_secret=...)``; marked sensitive through
    # ``enhanced_field`` and optional (defaults to None).
    client_secret: t.Optional[str] = enhanced_field(
        default=None,
        sensitive=True,
    )
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@dataclass
class SimpleRedditConfig(BaseConnectorConfig):
    """Settings describing which subreddit to read and how to authenticate.

    Raises:
        ValueError: at construction time when ``num_posts`` is not positive.
    """

    access_config: RedditAccessConfig
    # Name of the subreddit whose posts are listed.
    subreddit_name: str
    # Upper bound on the number of posts requested from Reddit.
    num_posts: int
    user_agent: str
    client_id: str
    # When set, posts come from a subreddit search instead of the "hot" listing.
    search_query: t.Optional[str] = None

    def __post_init__(self):
        """Validate that a positive number of posts was requested."""
        non_positive = self.num_posts <= 0
        if non_positive:
            raise ValueError("The number of Reddit posts to fetch must be positive.")
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
@dataclass
class RedditIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single Reddit post that is downloaded to disk as a Markdown file."""

    connector_config: SimpleRedditConfig = field(repr=False)
    post_id: str
    registry_name: str = "reddit"

    def _create_full_tmp_dir_path(self):
        """Create the directory that will contain the downloaded file, if missing."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["praw"])
    def get_post(self):
        """Return a praw ``Submission`` for ``self.post_id``.

        A fresh ``Reddit`` client is built from the connector config on every call.
        """
        from praw import Reddit
        from praw.models import Submission

        reddit = Reddit(
            client_id=self.connector_config.client_id,
            client_secret=self.connector_config.access_config.client_secret,
            user_agent=self.connector_config.user_agent,
        )
        post = Submission(reddit, self.post_id)
        return post

    def update_source_metadata(self, **kwargs):
        """Fill ``self.source_metadata`` from the post.

        Args:
            **kwargs: may carry ``post``, an already obtained submission. When the
                key is absent the post is obtained through ``get_post``.
        """
        # ``kwargs.get("post", self.get_post())`` would evaluate its default eagerly
        # and build a second Reddit client even when the caller supplied the post.
        post = kwargs["post"] if "post" in kwargs else self.get_post()
        if post is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        # The post counts as existing only if BOTH the author and the body are
        # still there. The previous expression joined the checks with ``or`` (and
        # referenced a misspelled ``post.auth``), which made it always true.
        # NOTE(review): praw is documented to expose a deleted author as None -
        # confirm against the praw version in use.
        author = post.author
        author_present = author is not None and author != "[deleted]"
        body_present = post.selftext not in ("[deleted]", "[removed]")
        file_exists = author_present and body_present

        self.source_metadata = SourceMetadata(
            date_created=datetime.utcfromtimestamp(post.created_utc).isoformat(),
            source_url=post.permalink,
            exists=file_exists,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        post = self.get_post()
        self.update_source_metadata(post=post)
        if post is None:
            raise ValueError(
                f"Failed to retrieve post {self.post_id}",
            )

        # Write the title plus the body, if any
        text_to_write = f"# {post.title}\n{post.selftext}"
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(text_to_write)

    @property
    def filename(self) -> Path:
        """Absolute path of the local Markdown file for this post."""
        return (Path(self.read_config.download_dir) / f"{self.post_id}.md").resolve()

    @property
    def _output_filename(self):
        """Path of the JSON output file for this post inside the output directory."""
        return Path(self.processor_config.output_dir) / f"{self.post_id}.json"

    @property
    def date_modified(self) -> t.Optional[str]:
        """Always None; no modification date is reported by this connector."""
        return None

    @property
    def version(self) -> t.Optional[str]:
        """Always None; no version is reported by this connector."""
        return None
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
@dataclass
class RedditSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that turns the posts of one subreddit into ingest docs."""

    connector_config: SimpleRedditConfig
    # Lazily created praw client; see the ``reddit`` property.
    _reddit: t.Optional["Reddit"] = field(init=False, default=None)

    @property
    def reddit(self) -> "Reddit":
        """Return the praw client, building and caching it on first access."""
        from praw import Reddit

        if self._reddit is not None:
            return self._reddit

        config = self.connector_config
        self._reddit = Reddit(
            client_id=config.client_id,
            client_secret=config.access_config.client_secret,
            user_agent=config.user_agent,
        )
        return self._reddit

    @requires_dependencies(["praw"], extras="reddit")
    def initialize(self):
        """Force creation of the praw client."""
        _ = self.reddit

    def check_connection(self):
        """Issue a HEAD request for the "me" endpoint to validate the credentials.

        Raises:
            SourceConnectionError: when the request fails with a prawcore
                ``ResponseException``.
        """
        from praw.endpoints import API_PATH
        from prawcore import ResponseException

        try:
            self.reddit._objectify_request(method="HEAD", params=None, path=API_PATH["me"])
        except ResponseException as response_error:
            message = f"failed to validate connection: {response_error}"
            logger.error(message, exc_info=True)
            raise SourceConnectionError(message)

    def get_ingest_docs(self):
        """List posts of the configured subreddit and wrap each id in an ingest doc.

        A search is run when ``search_query`` is set; otherwise the "hot" listing is
        used. Either way at most ``num_posts`` entries are requested.
        """
        config = self.connector_config
        subreddit = self.reddit.subreddit(config.subreddit_name)
        if not config.search_query:
            listing = subreddit.hot(limit=config.num_posts)
        else:
            listing = subreddit.search(
                config.search_query,
                limit=config.num_posts,
            )

        ingest_docs = []
        for submission in listing:
            ingest_docs.append(
                RedditIngestDoc(
                    connector_config=self.connector_config,
                    processor_config=self.processor_config,
                    read_config=self.read_config,
                    post_id=submission.id,
                )
            )
        return ingest_docs
|
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from typing import Dict, Type, cast
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.connector.airtable import AirtableIngestDoc
|
|
5
|
-
from unstructured_ingest.connector.astradb import AstraDBIngestDoc
|
|
6
|
-
from unstructured_ingest.connector.biomed import BiomedIngestDoc
|
|
7
|
-
from unstructured_ingest.connector.confluence import ConfluenceIngestDoc
|
|
8
|
-
from unstructured_ingest.connector.delta_table import DeltaTableIngestDoc
|
|
9
|
-
from unstructured_ingest.connector.discord import DiscordIngestDoc
|
|
10
|
-
from unstructured_ingest.connector.elasticsearch import (
|
|
11
|
-
ElasticsearchIngestDoc,
|
|
12
|
-
ElasticsearchIngestDocBatch,
|
|
13
|
-
)
|
|
14
|
-
from unstructured_ingest.connector.fsspec.azure import AzureBlobStorageIngestDoc
|
|
15
|
-
from unstructured_ingest.connector.fsspec.box import BoxIngestDoc
|
|
16
|
-
from unstructured_ingest.connector.fsspec.dropbox import DropboxIngestDoc
|
|
17
|
-
from unstructured_ingest.connector.fsspec.gcs import GcsIngestDoc
|
|
18
|
-
from unstructured_ingest.connector.fsspec.s3 import S3IngestDoc
|
|
19
|
-
from unstructured_ingest.connector.fsspec.sftp import SftpIngestDoc
|
|
20
|
-
from unstructured_ingest.connector.github import GitHubIngestDoc
|
|
21
|
-
from unstructured_ingest.connector.gitlab import GitLabIngestDoc
|
|
22
|
-
from unstructured_ingest.connector.google_drive import GoogleDriveIngestDoc
|
|
23
|
-
from unstructured_ingest.connector.hubspot import HubSpotIngestDoc
|
|
24
|
-
from unstructured_ingest.connector.jira import JiraIngestDoc
|
|
25
|
-
from unstructured_ingest.connector.kafka import KafkaIngestDoc
|
|
26
|
-
from unstructured_ingest.connector.local import LocalIngestDoc
|
|
27
|
-
from unstructured_ingest.connector.mongodb import MongoDBIngestDoc, MongoDBIngestDocBatch
|
|
28
|
-
from unstructured_ingest.connector.notion.connector import (
|
|
29
|
-
NotionDatabaseIngestDoc,
|
|
30
|
-
NotionPageIngestDoc,
|
|
31
|
-
)
|
|
32
|
-
from unstructured_ingest.connector.onedrive import OneDriveIngestDoc
|
|
33
|
-
from unstructured_ingest.connector.opensearch import OpenSearchIngestDoc, OpenSearchIngestDocBatch
|
|
34
|
-
from unstructured_ingest.connector.outlook import OutlookIngestDoc
|
|
35
|
-
from unstructured_ingest.connector.reddit import RedditIngestDoc
|
|
36
|
-
from unstructured_ingest.connector.salesforce import SalesforceIngestDoc
|
|
37
|
-
from unstructured_ingest.connector.sharepoint import SharepointIngestDoc
|
|
38
|
-
from unstructured_ingest.connector.slack import SlackIngestDoc
|
|
39
|
-
from unstructured_ingest.connector.wikipedia import (
|
|
40
|
-
WikipediaIngestHTMLDoc,
|
|
41
|
-
WikipediaIngestSummaryDoc,
|
|
42
|
-
WikipediaIngestTextDoc,
|
|
43
|
-
)
|
|
44
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
45
|
-
from unstructured_ingest.interfaces import BaseIngestDoc
|
|
46
|
-
|
|
47
|
-
# Maps the ``registry_name`` stored in a serialized ingest doc to the class that
# is used to rebuild it (see ``create_ingest_doc_from_dict``).
INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = {
    "airtable": AirtableIngestDoc,
    "astradb": AstraDBIngestDoc,
    "azure": AzureBlobStorageIngestDoc,
    "biomed": BiomedIngestDoc,
    "box": BoxIngestDoc,
    "confluence": ConfluenceIngestDoc,
    "delta-table": DeltaTableIngestDoc,
    "discord": DiscordIngestDoc,
    "dropbox": DropboxIngestDoc,
    # Connectors with a "_batch" entry register a separate batch doc class.
    "elasticsearch": ElasticsearchIngestDoc,
    "elasticsearch_batch": ElasticsearchIngestDocBatch,
    "gcs": GcsIngestDoc,
    "github": GitHubIngestDoc,
    "gitlab": GitLabIngestDoc,
    "google_drive": GoogleDriveIngestDoc,
    "hubspot": HubSpotIngestDoc,
    "jira": JiraIngestDoc,
    "kafka": KafkaIngestDoc,
    "local": LocalIngestDoc,
    "mongodb": MongoDBIngestDoc,
    "mongodb_batch": MongoDBIngestDocBatch,
    "notion_database": NotionDatabaseIngestDoc,
    "notion_page": NotionPageIngestDoc,
    "onedrive": OneDriveIngestDoc,
    "opensearch": OpenSearchIngestDoc,
    "opensearch_batch": OpenSearchIngestDocBatch,
    "outlook": OutlookIngestDoc,
    "reddit": RedditIngestDoc,
    "s3": S3IngestDoc,
    "salesforce": SalesforceIngestDoc,
    "sftp": SftpIngestDoc,
    "sharepoint": SharepointIngestDoc,
    "slack": SlackIngestDoc,
    # Wikipedia registers one doc class per output flavour.
    "wikipedia_html": WikipediaIngestHTMLDoc,
    "wikipedia_text": WikipediaIngestTextDoc,
    "wikipedia_summary": WikipediaIngestSummaryDoc,
}
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def create_ingest_doc_from_json(ingest_doc_json: str) -> BaseIngestDoc:
    """Rebuild an ingest doc from its JSON string form.

    Args:
        ingest_doc_json: JSON text holding the serialized ingest doc.

    Returns:
        Whatever ``create_ingest_doc_from_dict`` builds from the parsed payload.

    Raises:
        TypeError: when ``json.loads`` rejects the argument with a ``TypeError``;
            the new error quotes the offending value and chains the original.
    """
    try:
        parsed: dict = json.loads(ingest_doc_json)
    except TypeError as type_error:
        message = f"failed to load json string when deserializing IngestDoc: {ingest_doc_json}"
        raise TypeError(message) from type_error
    return create_ingest_doc_from_dict(parsed)
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def create_ingest_doc_from_dict(ingest_doc_dict: dict) -> BaseIngestDoc:
    """Rebuild an ingest doc from its dictionary form.

    The ``registry_name`` entry selects the class from
    ``INGEST_DOC_NAME_TO_CLASS``; the remaining entries are handed to that class's
    ``from_dict``. The caller's dictionary is not modified.

    Args:
        ingest_doc_dict: serialized ingest doc, including ``registry_name``.

    Returns:
        The object produced by the selected class's ``from_dict``.

    Raises:
        ValueError: when ``registry_name`` is missing or not a known registry key.
    """
    # Work on a copy so popping the registry name does not mutate the argument.
    ingest_doc_dict = ingest_doc_dict.copy()
    if "registry_name" not in ingest_doc_dict:
        raise ValueError(f"registry_name not present in ingest doc: {ingest_doc_dict}")
    registry_name = ingest_doc_dict.pop("registry_name")

    # Only the registry lookup is guarded: a KeyError raised while the class
    # deserializes its own fields must not be reported as an unknown name.
    try:
        ingest_doc_cls = INGEST_DOC_NAME_TO_CLASS[registry_name]
    except KeyError:
        # One message string (the stray comma used to make this a 2-tuple of args).
        raise ValueError(
            f"Error: Received unknown IngestDoc name: {registry_name} while deserializing "
            "IngestDoc.",
        ) from None
    return cast(BaseIngestDoc, ingest_doc_cls.from_dict(ingest_doc_dict))
|
|
@@ -1,301 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Salesforce Connector
|
|
3
|
-
Able to download Account, Case, Campaign, EmailMessage, Lead
|
|
4
|
-
Salesforce returns everything as a list of json.
|
|
5
|
-
This saves each entry as a separate file to be partitioned.
|
|
6
|
-
Using JWT authorization
|
|
7
|
-
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm
|
|
8
|
-
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import json
|
|
12
|
-
import typing as t
|
|
13
|
-
from collections import OrderedDict
|
|
14
|
-
from dataclasses import dataclass, field
|
|
15
|
-
from datetime import datetime
|
|
16
|
-
from email.utils import formatdate
|
|
17
|
-
from pathlib import Path
|
|
18
|
-
from string import Template
|
|
19
|
-
from textwrap import dedent
|
|
20
|
-
|
|
21
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
22
|
-
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
|
|
23
|
-
from unstructured_ingest.interfaces import (
|
|
24
|
-
AccessConfig,
|
|
25
|
-
BaseConnectorConfig,
|
|
26
|
-
BaseSingleIngestDoc,
|
|
27
|
-
BaseSourceConnector,
|
|
28
|
-
IngestDocCleanupMixin,
|
|
29
|
-
SourceConnectorCleanupMixin,
|
|
30
|
-
SourceMetadata,
|
|
31
|
-
)
|
|
32
|
-
from unstructured_ingest.logger import logger
|
|
33
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class MissingCategoryError(Exception):
    """Raised when a record type does not match any known category name."""
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
# API version handed to the simple_salesforce client (``version=`` argument).
SALESFORCE_API_VERSION = "57.0"

# Record types that may be listed in the connector's ``categories`` setting.
ACCEPTED_CATEGORIES = [
    "Account",
    "Case",
    "Campaign",
    "EmailMessage",
    "Lead",
]

# Skeleton used to render an EmailMessage record as a multipart .eml document.
# The ``$name`` placeholders are filled in with ``Template.substitute``.
EMAIL_TEMPLATE = Template(
    """MIME-Version: 1.0
Date: $date
Message-ID: $message_identifier
Subject: $subject
From: $from_email
To: $to_email
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
--00000000000095c9b205eff92630
Content-Type: text/plain; charset="UTF-8"
$textbody
--00000000000095c9b205eff92630
Content-Type: text/html; charset="UTF-8"
$htmlbody
--00000000000095c9b205eff92630--
""",
)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
@dataclass
class SalesforceAccessConfig(AccessConfig):
    """Sensitive Salesforce credentials: consumer key plus private key."""

    consumer_key: str = enhanced_field(sensitive=True)
    # Either PEM text or a path to a key file; see get_private_key_value_and_type.
    private_key: str = enhanced_field(sensitive=True)

    @requires_dependencies(["cryptography"])
    def get_private_key_value_and_type(self) -> t.Tuple[str, t.Type]:
        """Tell whether ``private_key`` holds PEM text or a path to a key file.

        Returns:
            ``(private_key, str)`` when the value loads as an unencrypted PEM
            private key, or ``(private_key, Path)`` when it names an existing file.

        Raises:
            ValueError: when the value is neither of the two.
        """
        from cryptography.hazmat.primitives import serialization

        key = self.private_key
        try:
            serialization.load_pem_private_key(data=key.encode("utf-8"), password=None)
            parsed_as_pem = True
        except ValueError:
            parsed_as_pem = False

        if parsed_as_pem:
            return key, str
        if Path(key).is_file():
            return key, Path
        raise ValueError("private_key does not contain PEM private key or path")
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
@dataclass
class SimpleSalesforceConfig(BaseConnectorConfig):
    """Salesforce connector settings and client factory."""

    access_config: SalesforceAccessConfig
    # Record types to download.
    categories: t.List[str]
    username: str
    recursive: bool = False

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def get_client(self):
        """Build a ``simple_salesforce.Salesforce`` client from this config.

        The private key is forwarded either as a file path or as inline text,
        depending on what the access config reports; the other argument is None.
        """
        from simple_salesforce import Salesforce

        key_value, key_kind = self.access_config.get_private_key_value_and_type()
        key_file = key_value if key_kind is Path else None
        inline_key = key_value if key_kind is str else None

        return Salesforce(
            username=self.username,
            consumer_key=self.access_config.consumer_key,
            privatekey_file=key_file,
            privatekey=inline_key,
            version=SALESFORCE_API_VERSION,
        )
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
@dataclass
class SalesforceIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """One Salesforce record, saved locally as an .eml or .xml file."""

    connector_config: SimpleSalesforceConfig
    record_type: str
    record_id: str
    registry_name: str = "salesforce"
    # Cache for the fetched record; filled on first access of ``record``.
    _record: OrderedDict = field(default_factory=lambda: OrderedDict())

    @property
    def record(self):
        """Return the record, fetching it through ``get_record`` while the cache is empty."""
        if not self._record:
            self._record = self.get_record()
        return self._record

    def get_file_extension(self) -> str:
        """Return ".eml" for EmailMessage records and ".xml" for the other known types.

        Raises:
            MissingCategoryError: when ``record_type`` is not a known category.
        """
        if self.record_type == "EmailMessage":
            extension = ".eml"
        elif self.record_type in ["Account", "Lead", "Case", "Campaign"]:
            extension = ".xml"
        else:
            raise MissingCategoryError(
                f"There are no categories with the name: {self.record_type}",
            )
        return extension

    def _tmp_download_file(self) -> Path:
        """Path of the downloaded file: <download_dir>/<record_type>/<record_id><ext>."""
        record_file = self.record_id + self.get_file_extension()
        return Path(self.read_config.download_dir) / self.record_type / record_file

    @property
    def _output_filename(self) -> Path:
        """Path of the JSON output: <output_dir>/<record_type>/<record_id><ext>.json."""
        record_file = self.record_id + self.get_file_extension() + ".json"
        return Path(self.processor_config.output_dir) / self.record_type / record_file

    def _create_full_tmp_dir_path(self):
        """Create the directory that will contain the downloaded file, if missing."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    def _xml_for_record(self, record: OrderedDict) -> str:
        """Creates partitionable xml file from a record"""
        import xml.etree.ElementTree as ET

        def flatten_dict(data, parent, prefix=""):
            # Nested OrderedDicts are flattened into dotted keys; every leaf becomes
            # an <item> element whose text is "key: value".
            for key, value in data.items():
                if isinstance(value, OrderedDict):
                    flatten_dict(value, parent, prefix=f"{prefix}{key}.")
                else:
                    item = ET.Element("item")
                    item.text = f"{prefix}{key}: {value}"
                    parent.append(item)

        root = ET.Element("root")
        flatten_dict(record, root)
        xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode()
        return xml_string

    def _eml_for_record(self, email_json: t.Dict[str, t.Any]) -> str:
        """Recreates standard expected .eml format using template."""
        from dateutil import parser  # type: ignore

        # A record may carry the key with a None value, in which case a ``get``
        # default is not applied; ``or ""`` covers both the missing and None cases
        # so the ``.replace`` calls below cannot fail.
        html_body = email_json.get("HtmlBody") or ""
        eml = EMAIL_TEMPLATE.substitute(
            date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()),
            message_identifier=email_json.get("MessageIdentifier"),
            subject=email_json.get("Subject"),
            from_email=email_json.get("FromAddress"),
            to_email=email_json.get("ToAddress"),
            textbody=email_json.get("TextBody"),
            # TODO: This is a hack to get emails to process correctly.
            # The HTML partitioner seems to have issues with <br> and text without tags like <p>
            htmlbody=html_body.replace("<br />", "<p>").replace("<body", "<body><p"),
        )
        return dedent(eml)

    @SourceConnectionNetworkError.wrap
    def _get_response(self):
        """Query Salesforce for the standard fields of this record."""
        client = self.connector_config.get_client()
        return client.query_all(
            f"select FIELDS(STANDARD) from {self.record_type} where Id='{self.record_id}'",
        )

    def get_record(self) -> OrderedDict:
        """Fetch the record by id and return the first entry of the response.

        Raises:
            ValueError: when the response contains no records.
        """
        # Get record from Salesforce based on id
        response = self._get_response()
        logger.debug(f"response was returned for salesforce record id: {self.record_id}")
        records = response["records"]
        if not records:
            raise ValueError(
                f"No record found with record id {self.record_id}: {json.dumps(response)}"
            )
        record_json = records[0]
        return record_json

    def update_source_metadata(self) -> None:  # type: ignore
        """Fill ``self.source_metadata`` from the record's date, version and url fields."""
        record_json = self.record

        date_format = "%Y-%m-%dT%H:%M:%S.000+0000"
        self.source_metadata = SourceMetadata(
            date_created=datetime.strptime(record_json["CreatedDate"], date_format).isoformat(),
            date_modified=datetime.strptime(
                record_json["LastModifiedDate"],
                date_format,
            ).isoformat(),
            # SystemModstamp is Timestamp if record has been modified by person or automated system
            version=record_json.get("SystemModstamp"),
            source_url=record_json["attributes"].get("url"),
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Saves individual json records locally."""
        self._create_full_tmp_dir_path()
        record = self.record

        self.update_source_metadata()

        try:
            if self.record_type == "EmailMessage":
                document = self._eml_for_record(record)
            else:
                document = self._xml_for_record(record)

            # The XML document declares utf-8, so write with that encoding instead
            # of whatever the platform default happens to be.
            with open(self._tmp_download_file(), "w", encoding="utf-8") as page_file:
                page_file.write(document)

        except Exception as e:
            # Best effort: a failure is logged (with traceback) and not re-raised.
            logger.error(
                f"Error while downloading and saving file: {self.record_id}.",
                exc_info=True,
            )
            logger.error(e)

    @property
    def filename(self):
        """The filename of the file created from a Salesforce record"""
        return self._tmp_download_file()
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
@dataclass
class SalesforceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that lists Salesforce record ids as ingest docs."""

    connector_config: SimpleSalesforceConfig

    def __post_init__(self):
        """Record the ingest doc class produced by this connector."""
        self.ingest_doc_cls: t.Type[SalesforceIngestDoc] = SalesforceIngestDoc

    def initialize(self):
        """Nothing to set up; clients are created on demand."""
        pass

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def check_connection(self):
        """Validate the credentials by building a client.

        Raises:
            SourceConnectionError: when client creation fails with a
                ``SalesforceError``.
        """
        from simple_salesforce.exceptions import SalesforceError

        try:
            self.connector_config.get_client()
        except SalesforceError as salesforce_error:
            logger.error(f"failed to validate connection: {salesforce_error}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {salesforce_error}")

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def get_ingest_docs(self) -> t.List[SalesforceIngestDoc]:
        """Get Salesforce Ids for the records.

        Send them to next phase where each doc gets downloaded into the
        appropriate format for partitioning.

        Raises:
            ValueError: when a configured category is not in ``ACCEPTED_CATEGORIES``.
            SalesforceMalformedRequest: when Salesforce rejects the id query.
        """
        from simple_salesforce.exceptions import SalesforceMalformedRequest

        client = self.connector_config.get_client()

        ingest_docs = []
        for record_type in self.connector_config.categories:
            if record_type not in ACCEPTED_CATEGORIES:
                raise ValueError(f"{record_type} not currently an accepted Salesforce category")

            try:
                # Get ids from Salesforce
                records = client.query_all(
                    f"select Id from {record_type}",
                )
            except SalesforceMalformedRequest as e:
                # Log and propagate the original error. Building a new
                # SalesforceMalformedRequest from a single message does not work:
                # its constructor expects the response details (see
                # simple_salesforce.exceptions), so that raise failed with TypeError.
                logger.error(f"Problem with Salesforce query: {e}")
                raise

            ingest_docs.extend(
                SalesforceIngestDoc(
                    connector_config=self.connector_config,
                    processor_config=self.processor_config,
                    read_config=self.read_config,
                    record_type=record_type,
                    record_id=record["Id"],
                )
                for record in records["records"]
            )

        return ingest_docs
|