unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from time import time
|
|
7
|
+
from typing import Any, AsyncGenerator, Literal, Union
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field, Secret
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.data_types.file_data import (
|
|
12
|
+
FileData,
|
|
13
|
+
FileDataSourceMetadata,
|
|
14
|
+
SourceIdentifiers,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.error import SourceConnectionError, ValueError
|
|
17
|
+
from unstructured_ingest.interfaces import (
|
|
18
|
+
AccessConfig,
|
|
19
|
+
ConnectionConfig,
|
|
20
|
+
Downloader,
|
|
21
|
+
DownloaderConfig,
|
|
22
|
+
DownloadResponse,
|
|
23
|
+
Indexer,
|
|
24
|
+
IndexerConfig,
|
|
25
|
+
)
|
|
26
|
+
from unstructured_ingest.logger import logger
|
|
27
|
+
from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
|
|
28
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
29
|
+
from unstructured_ingest.utils.html import HtmlMixin
|
|
30
|
+
|
|
31
|
+
from .client import ZendeskArticle, ZendeskClient, ZendeskTicket
|
|
32
|
+
|
|
33
|
+
CONNECTOR_TYPE = "zendesk"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Zendesk-specific payload attached to every indexed record: a tag naming the
# kind of item plus the ticket or article model itself.
class ZendeskAdditionalMetadata(BaseModel):
    # Discriminates which of the two content models is carried in `content`.
    item_type: Literal["ticket", "article"]
    # The item as returned by the Zendesk client.
    content: Union[ZendeskTicket, ZendeskArticle]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# FileData specialisation whose `additional_metadata` is narrowed to the
# Zendesk payload model instead of the base class's field type.
class ZendeskFileData(FileData):
    additional_metadata: ZendeskAdditionalMetadata
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Secret portion of the Zendesk connection settings; it is wrapped in
# `Secret[...]` by ZendeskConnectionConfig.
class ZendeskAccessConfig(AccessConfig):
    api_token: str = Field(
        description="API token for zendesk generated under Apps and Integrations",
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Connection settings for a Zendesk site: the non-secret subdomain and email,
# plus the secret-wrapped access config holding the API token.
class ZendeskConnectionConfig(ConnectionConfig):
    subdomain: str = Field(description="Subdomain for zendesk site, <sub-domain>.company.com")
    email: str = Field(description="Email for zendesk site registered at the subdomain")
    access_config: Secret[ZendeskAccessConfig]

    def get_client(self) -> ZendeskClient:
        """Return a new ``ZendeskClient`` built from this config.

        The API token is unwrapped from the secret access config and passed
        to the client together with the configured email and subdomain.
        """
        api_token = self.access_config.get_secret_value().api_token
        return ZendeskClient(email=self.email, subdomain=self.subdomain, token=api_token)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Indexer options for the Zendesk source connector.
class ZendeskIndexerConfig(IndexerConfig):
    # Selects what the indexer enumerates. "all" yields articles followed by
    # tickets (see ZendeskIndexer.run_async); the default is tickets only.
    item_type: Literal["tickets", "articles", "all"] = Field(
        default="tickets",
        # The previous text claimed only `tickets` or `articles` were valid,
        # contradicting the accepted `all` value above.
        description=(
            "Type of item from zendesk to parse, can be `tickets`, `articles`, "
            "or `all` (both)."
        ),
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class ZendeskIndexer(Indexer):
    # Enumerates Zendesk tickets and/or articles through the configured client
    # and emits one ZendeskFileData record per item.
    connection_config: ZendeskConnectionConfig
    index_config: ZendeskIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Build a client from the connection config.

        Nothing else is done here; any exception raised while constructing
        the client propagates to the caller.
        """
        self.connection_config.get_client()

    def is_async(self) -> bool:
        """Always return True; indexing is implemented by ``run_async``."""
        return True

    def _generate_fullpath(self, identifier: str) -> Path:
        """Return ``<first 16 hex chars of sha256(identifier)>.txt`` as a ``Path``."""
        digest = hashlib.sha256(identifier.encode("utf-8")).hexdigest()
        return Path(f"{digest[:16]}.txt")

    def _as_file_data(
        self,
        item: Union[ZendeskTicket, ZendeskArticle],
        item_type: Literal["ticket", "article"],
        source_identifiers: SourceIdentifiers,
    ) -> ZendeskFileData:
        """Wrap one ticket or article in a ``ZendeskFileData`` record.

        The record's identifier is the stringified item id, its display name
        is the fullpath from ``source_identifiers``, and the url and
        created/modified timestamps are copied from the item when present.
        """
        identifier = str(item.id)
        extra = ZendeskAdditionalMetadata(item_type=item_type, content=item)
        source_metadata = FileDataSourceMetadata(
            url=str(item.url) if item.url else None,
            date_created=item.created_at.isoformat() if item.created_at else None,
            date_modified=item.updated_at.isoformat() if item.updated_at else None,
            # Processing time is recorded as a stringified epoch timestamp.
            date_processed=str(time()),
        )
        return ZendeskFileData(
            identifier=identifier,
            connector_type=self.connector_type,
            source_identifiers=source_identifiers,
            additional_metadata=extra,
            metadata=source_metadata,
            display_name=source_identifiers.fullpath,
        )

    async def get_tickets(self) -> AsyncGenerator[ZendeskFileData, None]:
        """Yield one record per ticket returned by the client.

        Each ticket is addressed as ``tickets/<id>.txt``.
        """
        async with self.connection_config.get_client() as client:
            async for ticket in client.get_tickets():
                identifiers = SourceIdentifiers(
                    filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
                )
                yield self._as_file_data(ticket, "ticket", identifiers)

    async def get_articles(self) -> AsyncGenerator[ZendeskFileData, None]:
        """Yield one record per article returned by the client.

        Each article is addressed as ``articles/<id>.html``.
        """
        async with self.connection_config.get_client() as client:
            async for article in client.get_articles():
                identifiers = SourceIdentifiers(
                    filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
                )
                yield self._as_file_data(article, "article", identifiers)

    async def run_async(self, **kwargs: Any) -> AsyncGenerator[ZendeskFileData, None]:
        """Yield records for the item type selected in the indexer config.

        ``"articles"`` and ``"tickets"`` yield only that kind; ``"all"``
        yields every article first and then every ticket.

        Raises:
            ValueError: if the configured item type is none of the above.
        """
        selected = self.index_config.item_type

        if selected not in ("articles", "tickets", "all"):
            raise ValueError(f"Item type {selected} is not supported by the indexer")

        if selected in ("articles", "all"):
            async for article_record in self.get_articles():
                yield article_record

        if selected in ("tickets", "all"):
            async for ticket_record in self.get_tickets():
                yield ticket_record
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Downloader options for Zendesk: adds nothing of its own, it only combines
# the base downloader config with the options contributed by HtmlMixin.
class ZendeskDownloaderConfig(DownloaderConfig, HtmlMixin):
    ...
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@dataclass
class ZendeskDownloader(Downloader):
    # Writes indexed Zendesk items to local files: articles as prettified
    # HTML, tickets as text followed by the text of their comments.
    download_config: ZendeskDownloaderConfig
    connection_config: ZendeskConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Always return True; downloading is implemented by ``run_async``."""
        return True

    def download_embedded_files(
        self, session, html: str, current_file_data: FileData
    ) -> list[DownloadResponse]:
        """Extract files embedded in ``html`` via the download config.

        Returns an empty list when file extraction is disabled in the config
        or when the record carries no URL (a warning is logged in that case).
        Otherwise delegates to ``download_config.extract_embedded_files``,
        targeting a directory named after the item's download path with its
        suffix removed.
        """
        if not self.download_config.extract_files:
            return []
        url = current_file_data.metadata.url
        if url is None:
            # Single-line message: a multi-line literal here would inject a
            # newline and source indentation into the log output.
            logger.warning(
                f"Missing URL for file: {current_file_data.source_identifiers.filename}. "
                "Skipping file extraction."
            )
            return []
        filepath = current_file_data.source_identifiers.relative_path
        download_path = Path(self.download_dir) / filepath
        download_dir = download_path.with_suffix("")
        return self.download_config.extract_embedded_files(
            url=url,
            download_dir=download_dir,
            original_filedata=current_file_data,
            html=html,
            session=session,
        )

    @requires_dependencies(["aiofiles", "bs4"], extras="zendesk")
    async def download_article(self, article: ZendeskArticle, download_path: Path) -> None:
        """Write the article's HTML, prettified by BeautifulSoup, to ``download_path``."""
        import aiofiles
        import bs4

        article_html = article.as_html()
        soup = bs4.BeautifulSoup(article_html, "html.parser")
        async with aiofiles.open(download_path, "w", encoding="utf8") as f:
            await f.write(soup.prettify())

    @requires_dependencies(["aiofiles"], extras="zendesk")
    async def download_ticket(self, ticket: ZendeskTicket, download_path: Path) -> None:
        """Write the ticket text followed by each comment's text to ``download_path``."""
        import aiofiles

        # Fetch the comments before touching the filesystem so that a failed
        # API call does not leave a truncated ticket file behind.
        async with self.connection_config.get_client() as client:
            comments = [comment async for comment in client.get_comments(ticket_id=ticket.id)]

        async with aiofiles.open(download_path, "w", encoding="utf8") as f:
            await f.write(ticket.as_text())
            for comment in comments:
                await f.write(comment.as_text())

    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the item described by ``file_data`` and return the response.

        The record is cast to ``ZendeskFileData``, the parent directory of the
        download path is created if needed, and the item is written by
        ``download_article`` or ``download_ticket`` according to its type.

        Raises:
            SourceConnectionError: if the record's item type is neither
                ``"article"`` nor ``"ticket"``.
        """
        zendesk_filedata = ZendeskFileData.cast(file_data=file_data)

        item_type = zendesk_filedata.additional_metadata.item_type
        download_path = self.get_download_path(file_data=zendesk_filedata)
        download_path.parent.mkdir(parents=True, exist_ok=True)

        if item_type == "article":
            article = ZendeskArticle.model_validate(zendesk_filedata.additional_metadata.content)
            await self.download_article(article=article, download_path=download_path)
        elif item_type == "ticket":
            ticket = ZendeskTicket.model_validate(zendesk_filedata.additional_metadata.content)
            await self.download_ticket(ticket=ticket, download_path=download_path)
        else:
            raise SourceConnectionError(
                f"Item type {item_type} cannot be handled by the downloader"
            )
        return super().generate_download_response(
            file_data=zendesk_filedata, download_path=download_path
        )
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
# Source registry entry tying together the Zendesk connection config, the
# indexer and downloader classes, and their respective config models.
zendesk_source_entry = SourceRegistryEntry(
    connection_config=ZendeskConnectionConfig,
    indexer=ZendeskIndexer,
    indexer_config=ZendeskIndexerConfig,
    downloader=ZendeskDownloader,
    downloader_config=ZendeskDownloaderConfig,
)
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Literal, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field, SecretStr
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.interfaces.process import BaseProcess
|
|
9
|
+
from unstructured_ingest.utils.data_prep import get_json_data
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EmbedderConfig(BaseModel):
|
|
16
|
+
embedding_provider: Optional[
|
|
17
|
+
Literal[
|
|
18
|
+
"openai",
|
|
19
|
+
"azure-openai",
|
|
20
|
+
"huggingface",
|
|
21
|
+
"bedrock",
|
|
22
|
+
"vertexai",
|
|
23
|
+
"voyageai",
|
|
24
|
+
"octoai",
|
|
25
|
+
"mixedbread-ai",
|
|
26
|
+
"togetherai",
|
|
27
|
+
]
|
|
28
|
+
] = Field(default=None, description="Type of the embedding class to be used.")
|
|
29
|
+
embedding_api_key: Optional[SecretStr] = Field(
|
|
30
|
+
default=None,
|
|
31
|
+
description="API key for the embedding model, for the case an API key is needed.",
|
|
32
|
+
)
|
|
33
|
+
embedding_model_name: Optional[str] = Field(
|
|
34
|
+
default=None,
|
|
35
|
+
description="Embedding model name, if needed. "
|
|
36
|
+
"Chooses a particular LLM between different options, to embed with it.",
|
|
37
|
+
)
|
|
38
|
+
embedding_aws_access_key_id: Optional[str] = Field(
|
|
39
|
+
default=None, description="AWS access key used for AWS-based embedders, such as bedrock"
|
|
40
|
+
)
|
|
41
|
+
embedding_aws_secret_access_key: Optional[SecretStr] = Field(
|
|
42
|
+
default=None, description="AWS secret key used for AWS-based embedders, such as bedrock"
|
|
43
|
+
)
|
|
44
|
+
embedding_aws_region: Optional[str] = Field(
|
|
45
|
+
default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
|
|
46
|
+
)
|
|
47
|
+
embedding_azure_endpoint: Optional[str] = Field(
|
|
48
|
+
default=None,
|
|
49
|
+
description="Your Azure endpoint, including the resource, "
|
|
50
|
+
"e.g. `https://example-resource.azure.openai.com/`",
|
|
51
|
+
)
|
|
52
|
+
embedding_azure_api_version: Optional[str] = Field(
|
|
53
|
+
description="Azure API version", default=None
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
57
|
+
from unstructured_ingest.embed.huggingface import (
|
|
58
|
+
HuggingFaceEmbeddingConfig,
|
|
59
|
+
HuggingFaceEmbeddingEncoder,
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
return HuggingFaceEmbeddingEncoder(
|
|
63
|
+
config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs)
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
67
|
+
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
68
|
+
|
|
69
|
+
return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))
|
|
70
|
+
|
|
71
|
+
def get_azure_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
72
|
+
from unstructured_ingest.embed.azure_openai import (
|
|
73
|
+
AzureOpenAIEmbeddingConfig,
|
|
74
|
+
AzureOpenAIEmbeddingEncoder,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
config_kwargs = {
|
|
78
|
+
"api_key": self.embedding_api_key,
|
|
79
|
+
"azure_endpoint": self.embedding_azure_endpoint,
|
|
80
|
+
}
|
|
81
|
+
if api_version := self.embedding_azure_api_version:
|
|
82
|
+
config_kwargs["api_version"] = api_version
|
|
83
|
+
if model_name := self.embedding_model_name:
|
|
84
|
+
config_kwargs["model_name"] = model_name
|
|
85
|
+
|
|
86
|
+
return AzureOpenAIEmbeddingEncoder(
|
|
87
|
+
config=AzureOpenAIEmbeddingConfig.model_validate(config_kwargs)
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
91
|
+
from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
|
|
92
|
+
|
|
93
|
+
return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))
|
|
94
|
+
|
|
95
|
+
def get_bedrock_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
96
|
+
from unstructured_ingest.embed.bedrock import (
|
|
97
|
+
BedrockEmbeddingConfig,
|
|
98
|
+
BedrockEmbeddingEncoder,
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
embedding_kwargs = embedding_kwargs | {
|
|
102
|
+
"aws_access_key_id": self.embedding_aws_access_key_id,
|
|
103
|
+
"aws_secret_access_key": self.embedding_aws_secret_access_key.get_secret_value(),
|
|
104
|
+
"region_name": self.embedding_aws_region,
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
return BedrockEmbeddingEncoder(
|
|
108
|
+
config=BedrockEmbeddingConfig.model_validate(embedding_kwargs)
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
    """Build a Vertex AI embedding encoder from ``embedding_kwargs``.

    Args:
        embedding_kwargs: mapping validated into a ``VertexAIEmbeddingConfig``.

    Returns:
        A ``VertexAIEmbeddingEncoder`` wrapping the validated config.
    """
    # Imported inside the method rather than at module level.
    from unstructured_ingest.embed.vertexai import (
        VertexAIEmbeddingConfig,
        VertexAIEmbeddingEncoder,
    )

    validated = VertexAIEmbeddingConfig.model_validate(embedding_kwargs)
    return VertexAIEmbeddingEncoder(config=validated)
|
|
120
|
+
|
|
121
|
+
def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
    """Build a Voyage AI embedding encoder from ``embedding_kwargs``.

    Args:
        embedding_kwargs: mapping validated into a ``VoyageAIEmbeddingConfig``.

    Returns:
        A ``VoyageAIEmbeddingEncoder`` wrapping the validated config.
    """
    # Imported inside the method rather than at module level.
    from unstructured_ingest.embed.voyageai import (
        VoyageAIEmbeddingConfig,
        VoyageAIEmbeddingEncoder,
    )

    validated = VoyageAIEmbeddingConfig.model_validate(embedding_kwargs)
    return VoyageAIEmbeddingEncoder(config=validated)
|
|
130
|
+
|
|
131
|
+
def get_mixedbread_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
    """Build a Mixedbread AI embedding encoder from ``embedding_kwargs``.

    Args:
        embedding_kwargs: mapping validated into a
            ``MixedbreadAIEmbeddingConfig``.

    Returns:
        A ``MixedbreadAIEmbeddingEncoder`` wrapping the validated config.
    """
    # Imported inside the method rather than at module level.
    from unstructured_ingest.embed.mixedbreadai import (
        MixedbreadAIEmbeddingConfig,
        MixedbreadAIEmbeddingEncoder,
    )

    validated = MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
    return MixedbreadAIEmbeddingEncoder(config=validated)
|
|
140
|
+
|
|
141
|
+
def get_togetherai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
    """Build a Together AI embedding encoder from ``embedding_kwargs``.

    Args:
        embedding_kwargs: mapping validated into a
            ``TogetherAIEmbeddingConfig``.

    Returns:
        A ``TogetherAIEmbeddingEncoder`` wrapping the validated config.
    """
    # Imported inside the method rather than at module level.
    from unstructured_ingest.embed.togetherai import (
        TogetherAIEmbeddingConfig,
        TogetherAIEmbeddingEncoder,
    )

    validated = TogetherAIEmbeddingConfig.model_validate(embedding_kwargs)
    return TogetherAIEmbeddingEncoder(config=validated)
|
|
150
|
+
|
|
151
|
+
def get_embedder(self) -> "BaseEmbeddingEncoder":
    """Create the embedding encoder for the configured provider.

    A kwargs dict is assembled from ``embedding_api_key`` (unwrapped with
    ``get_secret_value()``) and ``embedding_model_name`` when each is truthy,
    then handed to the ``get_*_embedder`` method that matches
    ``embedding_provider``.

    Returns:
        Whatever the selected ``get_*_embedder`` method returns.

    Raises:
        ValueError: if ``embedding_provider`` matches none of the known names.
    """
    shared_kwargs: dict[str, Any] = {}
    if self.embedding_api_key:
        shared_kwargs["api_key"] = self.embedding_api_key.get_secret_value()
    if self.embedding_model_name:
        shared_kwargs["model_name"] = self.embedding_model_name

    # TODO make this more dynamic to map to encoder configs
    # Provider name -> name of the factory method on this object, checked in
    # this order. Methods are looked up by name so only the selected one is
    # ever accessed.
    factories = (
        ("openai", "get_openai_embedder"),
        ("huggingface", "get_huggingface_embedder"),
        ("octoai", "get_octoai_embedder"),
        ("bedrock", "get_bedrock_embedder"),
        ("vertexai", "get_vertexai_embedder"),
        ("voyageai", "get_voyageai_embedder"),
        ("mixedbread-ai", "get_mixedbread_embedder"),
        ("togetherai", "get_togetherai_embedder"),
        ("azure-openai", "get_azure_openai_embedder"),
    )
    for provider_name, factory_name in factories:
        if self.embedding_provider == provider_name:
            factory = getattr(self, factory_name)
            return factory(embedding_kwargs=shared_kwargs)

    raise ValueError(f"{self.embedding_provider} not a recognized encoder")
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
@dataclass
class Embedder(BaseProcess, ABC):
    """Process that runs the configured embedding encoder over a JSON file of elements.

    A fresh encoder is obtained from ``config.get_embedder()`` on every call
    to ``init``, ``precheck`` and ``run``.
    """

    config: EmbedderConfig

    def init(self, **kwargs: Any) -> None:
        """Obtain an encoder from the config and call its ``initialize()``."""
        encoder = self.config.get_embedder()
        encoder.initialize()

    def precheck(self) -> None:
        """Obtain an encoder from the config and call its ``precheck()``."""
        self.config.get_embedder().precheck()

    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
        """Embed the elements stored at ``elements_filepath``.

        Args:
            elements_filepath: file handed to ``get_json_data`` to load the
                elements.

        Returns:
            The result of the encoder's ``embed_documents`` call, or an empty
            list when the loaded data is falsy.
        """
        # TODO update base embedder classes to support async
        encoder = self.config.get_embedder()
        loaded = get_json_data(path=elements_filepath)
        # Nothing loaded: skip the encoder call entirely.
        return encoder.embed_documents(elements=loaded) if loaded else []
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any, Callable, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.interfaces.process import BaseProcess
|
|
10
|
+
from unstructured_ingest.logger import logger
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FiltererConfig(BaseModel):
    # Settings for Filterer. Both fields default to None, and Filterer only
    # registers a filter for a field that is set.
    # (Documented with comments rather than a docstring so the model's
    # generated schema stays exactly as it was.)

    # Glob patterns matched against each file's full path.
    file_glob: Optional[list[str]] = Field(
        examples=["*.pdf", "*.html"],
        description="file globs to limit which data_types of files are accepted",
        default=None,
    )

    # Upper bound on file size, in bytes.
    max_file_size: Optional[int] = Field(
        description="Max file size to process in bytes",
        default=None,
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class Filterer(BaseProcess, ABC):
    """Process that drops file data failing any of the configured filters.

    The active filters are chosen once, in ``__post_init__``, from the values
    present on ``config``.
    """

    config: FiltererConfig = field(default_factory=FiltererConfig)
    # Predicates applied in order by run(); each returns True to keep the file.
    filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)

    def __post_init__(self):
        """Register the filters that the config enables."""
        settings = self.config
        # (enabled?, predicate) pairs, kept in the order they are applied.
        candidates = (
            (settings.file_glob is not None, self.glob_filter),
            (settings.max_file_size, self.file_size_filter),
        )
        self.filters.extend(check for enabled, check in candidates if enabled)

    def is_async(self) -> bool:
        """Report that this process is not async."""
        return False

    def file_size_filter(self, file_data: FileData) -> bool:
        """Return True unless the file's known size exceeds ``max_file_size``.

        A missing or zero ``filesize_bytes`` always passes.
        """
        size = file_data.metadata.filesize_bytes
        if not size:
            return True
        return size <= self.config.max_file_size

    def glob_filter(self, file_data: FileData) -> bool:
        """Return True when the file's full path matches any configured glob.

        A debug message is logged when no pattern matches.
        """
        globs = self.config.file_glob
        target = file_data.source_identifiers.fullpath
        if any(fnmatch.filter([target], glob) for glob in globs):
            return True
        logger.debug(f"the file {target!r} is discarded as it does not match any given glob.")
        return False

    def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
        """Apply the registered filters to ``file_data``.

        Returns:
            ``file_data`` unchanged when every filter passes, otherwise
            ``None``. Evaluation stops at the first failing filter, which is
            named in a debug log message.
        """
        rejected_by = next((check for check in self.filters if not check(file_data)), None)
        if rejected_by is None:
            return file_data
        logger.debug(
            f"filtered out file data due to {rejected_by.__name__}: {file_data.identifier}"
        )
        return None
|