unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import unstructured_ingest.processes.connectors.databricks # noqa: F401
|
|
4
|
+
import unstructured_ingest.processes.connectors.duckdb # noqa: F401
|
|
5
|
+
import unstructured_ingest.processes.connectors.elasticsearch # noqa: F401
|
|
6
|
+
import unstructured_ingest.processes.connectors.fsspec # noqa: F401
|
|
7
|
+
import unstructured_ingest.processes.connectors.ibm_watsonx # noqa: F401
|
|
8
|
+
import unstructured_ingest.processes.connectors.kafka # noqa: F401
|
|
9
|
+
import unstructured_ingest.processes.connectors.lancedb # noqa: F401
|
|
10
|
+
import unstructured_ingest.processes.connectors.qdrant # noqa: F401
|
|
11
|
+
import unstructured_ingest.processes.connectors.sql # noqa: F401
|
|
12
|
+
import unstructured_ingest.processes.connectors.weaviate # noqa: F401
|
|
13
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
14
|
+
add_destination_entry,
|
|
15
|
+
add_source_entry,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
|
|
19
|
+
from .airtable import airtable_source_entry
|
|
20
|
+
from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
|
|
21
|
+
from .astradb import astra_db_destination_entry, astra_db_source_entry
|
|
22
|
+
from .azure_ai_search import CONNECTOR_TYPE as AZURE_AI_SEARCH_CONNECTOR_TYPE
|
|
23
|
+
from .azure_ai_search import azure_ai_search_destination_entry
|
|
24
|
+
from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
|
|
25
|
+
from .chroma import chroma_destination_entry
|
|
26
|
+
from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
|
|
27
|
+
from .confluence import confluence_source_entry
|
|
28
|
+
from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
|
|
29
|
+
from .couchbase import couchbase_destination_entry, couchbase_source_entry
|
|
30
|
+
from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
|
|
31
|
+
from .delta_table import delta_table_destination_entry
|
|
32
|
+
from .discord import CONNECTOR_TYPE as DISCORD_CONNECTOR_TYPE
|
|
33
|
+
from .discord import discord_source_entry
|
|
34
|
+
from .github import CONNECTOR_TYPE as GITHUB_CONNECTOR_TYPE
|
|
35
|
+
from .github import github_source_entry
|
|
36
|
+
from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
|
|
37
|
+
from .gitlab import gitlab_source_entry
|
|
38
|
+
from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
|
|
39
|
+
from .google_drive import google_drive_source_entry
|
|
40
|
+
from .jira import CONNECTOR_TYPE as JIRA_CONNECTOR_TYPE
|
|
41
|
+
from .jira import jira_source_entry
|
|
42
|
+
from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
|
|
43
|
+
from .kdbai import kdbai_destination_entry
|
|
44
|
+
from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
|
|
45
|
+
from .local import local_destination_entry, local_source_entry
|
|
46
|
+
from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
|
|
47
|
+
from .milvus import milvus_destination_entry
|
|
48
|
+
from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
|
|
49
|
+
from .mongodb import mongodb_destination_entry, mongodb_source_entry
|
|
50
|
+
from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE
|
|
51
|
+
from .neo4j import neo4j_destination_entry
|
|
52
|
+
from .notion.connector import CONNECTOR_TYPE as NOTION_CONNECTOR_TYPE
|
|
53
|
+
from .notion.connector import notion_source_entry
|
|
54
|
+
from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
|
|
55
|
+
from .onedrive import onedrive_destination_entry, onedrive_source_entry
|
|
56
|
+
from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
|
|
57
|
+
from .outlook import outlook_source_entry
|
|
58
|
+
from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
|
|
59
|
+
from .pinecone import pinecone_destination_entry
|
|
60
|
+
from .redisdb import CONNECTOR_TYPE as REDIS_CONNECTOR_TYPE
|
|
61
|
+
from .redisdb import redis_destination_entry
|
|
62
|
+
from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
|
|
63
|
+
from .salesforce import salesforce_source_entry
|
|
64
|
+
from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
|
|
65
|
+
from .sharepoint import sharepoint_source_entry
|
|
66
|
+
from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
|
|
67
|
+
from .slack import slack_source_entry
|
|
68
|
+
from .vectara import CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE
|
|
69
|
+
from .vectara import vectara_destination_entry
|
|
70
|
+
from .zendesk.zendesk import CONNECTOR_TYPE as ZENDESK_CONNECTOR_TYPE
|
|
71
|
+
from .zendesk.zendesk import zendesk_source_entry
|
|
72
|
+
|
|
73
|
+
def _register_bundled_connectors() -> None:
    """Register every connector entry imported by this package.

    The table below lists the registrations in the exact order they are
    performed; each row is (role, connector type, registry entry).
    """
    source, destination = "source", "destination"
    ordered_registrations = (
        (source, ASTRA_DB_CONNECTOR_TYPE, astra_db_source_entry),
        (destination, ASTRA_DB_CONNECTOR_TYPE, astra_db_destination_entry),
        (destination, CHROMA_CONNECTOR_TYPE, chroma_destination_entry),
        (source, COUCHBASE_CONNECTOR_TYPE, couchbase_source_entry),
        (destination, COUCHBASE_CONNECTOR_TYPE, couchbase_destination_entry),
        (destination, DELTA_TABLE_CONNECTOR_TYPE, delta_table_destination_entry),
        (source, GOOGLE_DRIVE_CONNECTOR_TYPE, google_drive_source_entry),
        (source, LOCAL_CONNECTOR_TYPE, local_source_entry),
        (destination, LOCAL_CONNECTOR_TYPE, local_destination_entry),
        (source, ONEDRIVE_CONNECTOR_TYPE, onedrive_source_entry),
        (destination, ONEDRIVE_CONNECTOR_TYPE, onedrive_destination_entry),
        (destination, NEO4J_CONNECTOR_TYPE, neo4j_destination_entry),
        (source, SALESFORCE_CONNECTOR_TYPE, salesforce_source_entry),
        (destination, MONGODB_CONNECTOR_TYPE, mongodb_destination_entry),
        (source, MONGODB_CONNECTOR_TYPE, mongodb_source_entry),
        (destination, PINECONE_CONNECTOR_TYPE, pinecone_destination_entry),
        (source, SHAREPOINT_CONNECTOR_TYPE, sharepoint_source_entry),
        (destination, MILVUS_CONNECTOR_TYPE, milvus_destination_entry),
        (destination, AZURE_AI_SEARCH_CONNECTOR_TYPE, azure_ai_search_destination_entry),
        (destination, KDBAI_CONNECTOR_TYPE, kdbai_destination_entry),
        (source, AIRTABLE_CONNECTOR_TYPE, airtable_source_entry),
        (source, NOTION_CONNECTOR_TYPE, notion_source_entry),
        (source, OUTLOOK_CONNECTOR_TYPE, outlook_source_entry),
        (source, GITLAB_CONNECTOR_TYPE, gitlab_source_entry),
        (source, SLACK_CONNECTOR_TYPE, slack_source_entry),
        (destination, VECTARA_CONNECTOR_TYPE, vectara_destination_entry),
        (source, CONFLUENCE_CONNECTOR_TYPE, confluence_source_entry),
        (source, DISCORD_CONNECTOR_TYPE, discord_source_entry),
        (destination, REDIS_CONNECTOR_TYPE, redis_destination_entry),
        (source, JIRA_CONNECTOR_TYPE, jira_source_entry),
        (source, ZENDESK_CONNECTOR_TYPE, zendesk_source_entry),
        (source, GITHUB_CONNECTOR_TYPE, github_source_entry),
    )
    for role, connector_type, entry in ordered_registrations:
        if role == source:
            add_source_entry(source_type=connector_type, entry=entry)
        else:
            add_destination_entry(destination_type=connector_type, entry=entry)


# Registration happens as an import-time side effect of this package.
_register_bundled_connectors()
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
4
|
+
from uuid import NAMESPACE_DNS, uuid5
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field, Secret, field_validator
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
9
|
+
from unstructured_ingest.error import ValueError
|
|
10
|
+
from unstructured_ingest.interfaces import (
|
|
11
|
+
AccessConfig,
|
|
12
|
+
ConnectionConfig,
|
|
13
|
+
Downloader,
|
|
14
|
+
DownloaderConfig,
|
|
15
|
+
DownloadResponse,
|
|
16
|
+
Indexer,
|
|
17
|
+
IndexerConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
20
|
+
SourceRegistryEntry,
|
|
21
|
+
)
|
|
22
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from pyairtable import Api
|
|
26
|
+
from pyairtable.api.types import RecordDict
|
|
27
|
+
|
|
28
|
+
CONNECTOR_TYPE = "airtable"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AirtableTableMeta(BaseModel):
    """Identify one Airtable table by the base that stores it and its table id.

    An optional view id can be supplied when only the rows and fields of a
    particular view are to be ingested.
    """

    base_id: str
    table_id: str
    view_id: Optional[str] = None

    def get_id(self) -> str:
        """Return a UUIDv5 string (DNS namespace) built from the concatenated ids.

        The view id takes part in the name only when it is set and non-empty.
        """
        parts = [self.base_id, self.table_id]
        if self.view_id:
            parts.append(self.view_id)
        return str(uuid5(NAMESPACE_DNS, "".join(parts)))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AirtableAccessConfig(AccessConfig):
    # Credentials for Airtable. Kept as `#` comments (not a class docstring)
    # so the model's generated description stays untouched.
    personal_access_token: str = Field(
        description=(
            "Personal access token to authenticate into Airtable. "
            "Check: https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens"
            " for more info"
        )
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class AirtableConnectionConfig(ConnectionConfig):
    # Connection settings for Airtable; the access config is wrapped in
    # pydantic's `Secret` and unwrapped only when a client is built.
    access_config: Secret[AirtableAccessConfig]

    @requires_dependencies(["pyairtable"], extras="airtable")
    def get_client(self) -> "Api":
        """Build a new pyairtable ``Api`` client from the personal access token."""
        # Imported lazily so the module can be imported without pyairtable.
        from pyairtable import Api

        token = self.access_config.get_secret_value().personal_access_token
        return Api(api_key=token)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class AirtableIndexerConfig(IndexerConfig):
    # Indexer settings for Airtable: an optional list of
    # "base_id[/table_id[/view_id]]" paths restricting what gets ingested.
    list_of_paths: Optional[list[str]] = Field(
        default=None,
        description="""
        A list of paths that specify the locations to ingest data from within Airtable.

        If this argument is not set, the connector ingests all tables within each and every base.
        --list-of-paths: path1 path2 path3 ….
        path: base_id/table_id(optional)/view_id(optional)/

        To obtain (base, table, view) ids in bulk, check:
        https://airtable.com/developers/web/api/list-bases (base ids)
        https://airtable.com/developers/web/api/get-base-schema (table and view ids)
        https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)

        To obtain specific ids from Airtable UI, go to your workspace, and copy any
        relevant id from the URL structure:
        https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
        appAbcDeF1ghijKlm -> base_id
        tblABcdEfG1HIJkLm -> table_id
        viwABCDEfg6hijKLM -> view_id

        You can also check: https://support.airtable.com/docs/finding-airtable-ids

        Here is an example for one --list-of-paths:
        base1/ → gets the entirety of all tables inside base1
        base1/table1 → gets all rows and columns within table1 in base1
        base1/table1/view1 → gets the rows and columns that are
        visible in view1 for the table1 in base1

        Examples to invalid airtable_paths:
        table1 → has to mention base to be valid
        base1/view1 → has to mention table to be valid
        """,
    )

    @classmethod
    def validate_path(cls, path: str) -> None:
        """Check that ``path`` has the shape ``base_id[/table_id[/view_id]]``.

        Trailing slashes are accepted, since the documented format
        (``base_id/table_id(optional)/view_id(optional)/``) and the ``base1/``
        example both end with one.

        Raises:
            ValueError: if the path has more than three components or any
                component is empty (e.g. ``""``, ``"/base"``, ``"base//view"``).
        """
        components = path.rstrip("/").split("/")
        if len(components) > 3 or not all(components):
            # NOTE(review): ValueError here is the name imported from
            # unstructured_ingest.error, not necessarily the builtin — confirm
            # it is one pydantic converts into a ValidationError.
            raise ValueError(
                f"Path must be of the format: base_id/table_id/view_id, "
                f"where table id and view id are optional. Got: {path}"
            )

    @field_validator("list_of_paths")
    @classmethod
    def validate_format(cls, v: Optional[list[str]]) -> Optional[list[str]]:
        """Validate every configured path and return the value unchanged.

        ``None`` (the field's default, which may also be passed explicitly) is
        returned as-is instead of being iterated.
        """
        if v is None:
            return v
        for path in v:
            cls.validate_path(path=path)
        return v
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass
class AirtableIndexer(Indexer):
    """Indexer that yields one ``FileData`` per Airtable table (or table view).

    The tables come either from the configured ``list_of_paths`` or, when that
    is unset or empty, from every base the client can list.
    """

    connector_type: str = CONNECTOR_TYPE
    connection_config: AirtableConnectionConfig
    index_config: AirtableIndexerConfig

    def get_all_table_meta(self) -> list[AirtableTableMeta]:
        """Return metadata for every table in every base listed by the client."""
        client = self.connection_config.get_client()
        return [
            AirtableTableMeta(base_id=base.id, table_id=table.id)
            for base in client.bases()
            for table in base.schema().tables
        ]

    def get_base_tables_meta(self, base_id: str) -> list[AirtableTableMeta]:
        """Return metadata for every table of the base identified by ``base_id``."""
        client = self.connection_config.get_client()
        base = client.base(base_id=base_id)
        return [AirtableTableMeta(base_id=base.id, table_id=table.id) for table in base.tables()]

    def get_meta_from_list(self) -> list[AirtableTableMeta]:
        """Translate ``index_config.list_of_paths`` into table metadata.

        A one-component path expands to all tables of that base; two components
        name a table; three components name a table view.

        Raises:
            ValueError: if a path has more than three components.
        """
        airtable_meta = []
        for path in self.index_config.list_of_paths or []:
            # Drop trailing slashes first: "base1/" must mean "all of base1",
            # not a table with an empty id inside base1.
            components = path.rstrip("/").split("/")
            if len(components) == 1:
                airtable_meta.extend(self.get_base_tables_meta(base_id=components[0]))
            elif len(components) == 2:
                airtable_meta.append(
                    AirtableTableMeta(base_id=components[0], table_id=components[1])
                )
            elif len(components) == 3:
                airtable_meta.append(
                    AirtableTableMeta(
                        base_id=components[0], table_id=components[1], view_id=components[2]
                    )
                )
            else:
                raise ValueError(
                    f"Path must be of the format: base_id/table_id/view_id, "
                    f"where table id and view id are optional. Got: {path}"
                )
        return airtable_meta

    def get_table_metas(self) -> list[AirtableTableMeta]:
        """Return the configured tables, or all tables when no paths are configured."""
        if not self.index_config.list_of_paths:
            return self.get_all_table_meta()
        return self.get_meta_from_list()

    def precheck(self) -> None:
        """Send a HEAD request to the client's ``meta/bases`` URL.

        NOTE(review): presumably the client raises on a failed request, which
        is what surfaces bad credentials — confirm against pyairtable.
        """
        client = self.connection_config.get_client()
        client.request(method="HEAD", url=client.build_url("meta", "bases"))

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for each table meta, named after its ids with a ``.csv`` suffix."""
        table_metas = self.get_table_metas()
        for table_meta in table_metas:
            # The view id becomes the file name when present; otherwise the table id does.
            fullpath = (
                f"{table_meta.base_id}/{table_meta.table_id}/{table_meta.view_id}.csv"
                if table_meta.view_id
                else f"{table_meta.base_id}/{table_meta.table_id}.csv"
            )
            yield FileData(
                identifier=table_meta.get_id(),
                connector_type=CONNECTOR_TYPE,
                additional_metadata=table_meta.model_dump(),
                source_identifiers=SourceIdentifiers(
                    filename=str(Path(fullpath).name),
                    fullpath=fullpath,
                ),
                display_name=fullpath,
            )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class AirtableDownloaderConfig(DownloaderConfig):
    # No Airtable-specific download options; everything comes from DownloaderConfig.
    ...
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
@dataclass
class AirtableDownloader(Downloader):
    """Downloader that writes the rows of one Airtable table (or view) to a CSV file."""

    connection_config: AirtableConnectionConfig
    download_config: AirtableDownloaderConfig = field(default_factory=AirtableDownloaderConfig)
    connector_type: str = CONNECTOR_TYPE

    def get_table_contents(self, table_meta: AirtableTableMeta) -> list["RecordDict"]:
        """Fetch all records of the table, restricted to the view when one is set."""
        api = self.connection_config.get_client()
        table = api.table(base_id=table_meta.base_id, table_name=table_meta.table_id)
        if table_meta.view_id:
            return table.all(view=table_meta.view_id)
        return table.all()

    def _table_row_to_dict(self, table_row: "RecordDict") -> dict:
        """Flatten a record into one dict: id, created_time, then the record's fields."""
        # Unpacking the fields last lets them override "id"/"created_time"
        # on a name clash, exactly as a subsequent dict.update would.
        return {
            "id": table_row["id"],
            "created_time": table_row["createdTime"],
            **table_row["fields"],
        }

    @requires_dependencies(["pandas"], extras="airtable")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the table described by ``file_data`` and store it as CSV.

        The table/view ids are read back from ``file_data.additional_metadata``;
        columns are sorted by name before the frame is written.
        """
        import pandas as pd

        table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
        records = self.get_table_contents(table_meta=table_meta)
        flattened = [self._table_row_to_dict(table_row=record) for record in records]
        frame = pd.DataFrame.from_dict(data=flattened).sort_index(axis=1)

        target = self.get_download_path(file_data=file_data)
        target.parent.mkdir(parents=True, exist_ok=True)
        frame.to_csv(path_or_buf=target)
        return self.generate_download_response(file_data=file_data, download_path=target)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# Registry entry bundling the Airtable connection config with its indexer and
# downloader classes and their respective config models.
airtable_source_entry = SourceRegistryEntry(
    connection_config=AirtableConnectionConfig,
    indexer=AirtableIndexer,
    indexer_config=AirtableIndexerConfig,
    downloader=AirtableDownloader,
    downloader_config=AirtableDownloaderConfig,
)
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"properties": [
|
|
3
|
+
{
|
|
4
|
+
"dataType": [
|
|
5
|
+
"text"
|
|
6
|
+
],
|
|
7
|
+
"indexFilterable": true,
|
|
8
|
+
"indexSearchable": true,
|
|
9
|
+
"name": "record_id",
|
|
10
|
+
"tokenization": "word"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"dataType": [
|
|
14
|
+
"text"
|
|
15
|
+
],
|
|
16
|
+
"indexFilterable": true,
|
|
17
|
+
"indexSearchable": true,
|
|
18
|
+
"name": "text",
|
|
19
|
+
"tokenization": "word"
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"vectorizer": "none"
|
|
23
|
+
}
|