unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import date, datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
9
|
+
|
|
10
|
+
from dateutil import parser
|
|
11
|
+
from pydantic import Field, Secret
|
|
12
|
+
|
|
13
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
14
|
+
from unstructured_ingest.error import DestinationConnectionError, ValueError, WriteError
|
|
15
|
+
from unstructured_ingest.interfaces import (
|
|
16
|
+
AccessConfig,
|
|
17
|
+
ConnectionConfig,
|
|
18
|
+
UploaderConfig,
|
|
19
|
+
UploadStager,
|
|
20
|
+
UploadStagerConfig,
|
|
21
|
+
VectorDBUploader,
|
|
22
|
+
)
|
|
23
|
+
from unstructured_ingest.logger import logger
|
|
24
|
+
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
|
|
25
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from weaviate.classes.init import Timeout
|
|
29
|
+
from weaviate.client import WeaviateClient
|
|
30
|
+
from weaviate.collections.batch.client import BatchClient
|
|
31
|
+
|
|
32
|
+
CONNECTOR_TYPE = "weaviate"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class WeaviateAccessConfig(AccessConfig, ABC):
    """Marker base class for Weaviate access configuration.

    Declares no fields itself; concrete deployment variants subclass it
    with their own credentials.
    """

    pass
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class WeaviateConnectionConfig(ConnectionConfig, ABC):
    """Abstract connection configuration for Weaviate.

    Holds the timeout settings shared by all deployment variants; concrete
    subclasses implement `get_client` to yield a connected client.
    """

    # Timeouts are plain seconds, each constrained to be non-negative.
    init_timeout: int = Field(default=2, ge=0, description="Timeout for initialization checks")
    insert_timeout: int = Field(default=90, ge=0, description="Timeout for insert operations")
    query_timeout: int = Field(default=30, ge=0, description="Timeout for query operations")
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_timeout(self) -> "Timeout":
        """Build a weaviate `Timeout` object from the configured timeout values."""
        # Imported lazily so the weaviate extra is only required when used.
        from weaviate.classes.init import Timeout

        return Timeout(init=self.init_timeout, query=self.query_timeout, insert=self.insert_timeout)

    # NOTE(review): @abstractmethod is the outermost decorator here; the Python
    # docs recommend applying it innermost when stacking decorators, though this
    # combination still marks the method abstract.
    @abstractmethod
    @contextmanager
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a connected WeaviateClient; implemented by subclasses."""
        pass
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class WeaviateUploadStagerConfig(UploadStagerConfig):
    """Configuration for the Weaviate upload stager; defines no extra options."""

    pass
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class WeaviateUploadStager(UploadStager):
    """Stages partitioned element dicts so they conform to the Weaviate schema.

    Nested dict/list metadata fields are serialized to JSON strings, date
    fields are normalized to `%Y-%m-%dT%H:%M:%S.%fZ` text, and selected
    scalar fields are cast to strings so they pass schema type checks.
    """

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=lambda: WeaviateUploadStagerConfig()
    )

    @staticmethod
    def parse_date_string(date_string: str) -> datetime:
        """Parse a date string into a datetime.

        The string is first treated as a unix timestamp; if that fails, it is
        handed to dateutil's parser.

        Note: the return annotation was `date` in the original, but both
        branches return a `datetime` — fixed here.
        """
        try:
            timestamp = float(date_string)
            return datetime.fromtimestamp(timestamp)
        except Exception as e:
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """
        Updates the element dictionary to conform to the Weaviate schema
        """
        # One shallow copy suffices; the original made two identical shallow
        # copies. Nested "metadata" dicts are shared with the input and mutated
        # in place, matching the original behavior.
        working_data = element_dict.copy()
        metadata = working_data.get("metadata", {})
        data_source = metadata.get("data_source", {})

        date_format = "%Y-%m-%dT%H:%M:%S.%fZ"

        def conform_date(value: str) -> str:
            # Normalize any supported date representation to the schema format.
            return self.parse_date_string(value).strftime(date_format)

        # Dict as string formatting. json.dumps already returns str; the
        # original's extra str() wrappers were redundant and are dropped.
        if record_locator := data_source.get("record_locator"):
            # Explicit casting otherwise fails schema type checking
            data_source["record_locator"] = json.dumps(record_locator)

        # Array of items as string formatting
        if points := metadata.get("coordinates", {}).get("points"):
            metadata["coordinates"]["points"] = json.dumps(points)

        if links := metadata.get("links", {}):
            metadata["links"] = json.dumps(links)

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        # Datetime formatting: the three data_source date fields share the
        # same treatment, so loop instead of repeating the block.
        for date_field in ("date_created", "date_modified", "date_processed"):
            if date_value := data_source.get(date_field):
                data_source[date_field] = conform_date(date_value)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = conform_date(last_modified)

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)

        # Tag the staged object with its source record id for later cleanup.
        working_data[RECORD_ID_LABEL] = file_data.identifier
        return working_data
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class WeaviateUploaderConfig(UploaderConfig):
    """Uploader settings for Weaviate; exactly one batching strategy must be active."""

    collection: Optional[str] = Field(
        description="The name of the collection this object belongs to", default=None
    )
    batch_size: Optional[int] = Field(default=None, description="Number of records per batch")
    requests_per_minute: Optional[int] = Field(default=None, description="Rate limit for upload")
    dynamic_batch: bool = Field(default=True, description="Whether to use dynamic batch")
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )

    def model_post_init(self, __context: Any) -> None:
        """Validate that exactly one batch mode is enabled after model creation."""
        mode_flags = {
            "fixed_size": self.batch_size is not None,
            "rate_limited": self.requests_per_minute is not None,
            "dynamic": self.dynamic_batch,
        }
        active_modes = [name for name, enabled in mode_flags.items() if enabled]
        if not active_modes:
            raise ValueError("No batch mode enabled")
        if len(active_modes) > 1:
            joined = ", ".join(active_modes)
            raise ValueError(
                "Multiple batch modes enabled, only one mode can be used: {}".format(joined)
            )
        logger.info(f"Uploader config instantiated with {active_modes[0]} batch mode")

    @contextmanager
    def get_batch_client(self, client: "WeaviateClient") -> Generator["BatchClient", None, None]:
        """Yield the weaviate batch client matching the configured batching strategy."""
        if self.dynamic_batch:
            batch_ctx = client.batch.dynamic()
        elif self.batch_size:
            batch_ctx = client.batch.fixed_size(batch_size=self.batch_size)
        elif self.requests_per_minute:
            batch_ctx = client.batch.rate_limit(requests_per_minute=self.requests_per_minute)
        else:
            raise ValueError("No batch mode enabled")
        with batch_ctx as batch_client:
            yield batch_client
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@dataclass
class WeaviateUploader(VectorDBUploader, ABC):
    """Writes staged records to a Weaviate collection, replacing entries
    previously uploaded for the same source record id."""

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig

    def _collection_exists(self, collection_name: Optional[str] = None) -> bool:
        """Return True if the given (or configured) collection exists."""
        collection_name = collection_name or self.upload_config.collection
        with self.connection_config.get_client() as weaviate_client:
            return weaviate_client.collections.exists(name=collection_name)

    def precheck(self) -> None:
        """Validate connectivity and, when a collection is configured, that it exists.

        Raises:
            DestinationConnectionError: if the connection cannot be established
                or the configured collection is missing.
        """
        try:
            with self.connection_config.get_client():
                # Connection test successful - client is available but not needed
                pass

            # only if collection name populated should we check that it exists
            if self.upload_config.collection and not self._collection_exists():
                raise DestinationConnectionError(
                    f"collection '{self.upload_config.collection}' does not exist"
                )
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    def init(self, **kwargs: Any) -> None:
        """Initialize the destination by creating the collection if needed."""
        self.create_destination(**kwargs)

    def format_destination_name(self, destination_name: str) -> str:
        """
        Weaviate Collection naming conventions:
        1. must begin with an uppercase letter
        2. must be alphanumeric and underscores only
        """
        # The regex accepts ANY letter (casing is fixed up below); the original
        # error message misleadingly demanded an "uppercase" letter.
        if not re.match(r"^[a-zA-Z]", destination_name):
            raise ValueError("Collection name must start with a letter")
        # Replace all non-alphanumeric characters with underscores
        formatted = re.sub(r"[^a-zA-Z0-9]", "_", destination_name)
        # Uppercase the first character and leave the rest as is. This single
        # expression covers both the len==1 and len>1 branches of the original.
        formatted = formatted[0].upper() + formatted[1:]
        if formatted != destination_name:
            logger.warning(
                f"Given Collection name '{destination_name}' doesn't follow naming conventions. "
                f"Renaming to '{formatted}'"
            )
        return formatted

    def create_destination(
        self,
        destination_name: str = "Unstructuredautocreated",
        vector_length: Optional[int] = None,
        **kwargs: Any,
    ) -> bool:
        """Create the target collection from the bundled default config.

        Returns:
            True if a new collection was created, False if it already existed.
        """
        collection_name = self.upload_config.collection or destination_name
        collection_name = self.format_destination_name(collection_name)
        # Persist the (possibly renamed) collection so later uploads use it.
        self.upload_config.collection = collection_name

        if self._collection_exists():
            logger.debug(f"Collection with name '{collection_name}' already exists, skipping creation")
            return False

        # Load the default collection definition shipped in the assets folder.
        connectors_dir = Path(__file__).parents[1]
        collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
        with collection_config_file.open() as f:
            collection_config = json.load(f)
        collection_config["class"] = collection_name

        logger.info(f"Creating weaviate collection '{collection_name}' with default configs")
        with self.connection_config.get_client() as weaviate_client:
            weaviate_client.collections.create_from_dict(config=collection_config)
        return True

    def check_for_errors(self, client: "WeaviateClient") -> None:
        """Raise WriteError if the last batch run recorded any failed objects."""
        failed_uploads = client.batch.failed_objects
        if not failed_uploads:
            return
        for failure in failed_uploads:
            logger.error(
                f"Failed to upload object with id {failure.original_uuid}: {failure.message}"
            )
        raise WriteError("Failed to upload to weaviate")

    @requires_dependencies(["weaviate"], extras="weaviate")
    def delete_by_record_id(self, client: "WeaviateClient", file_data: FileData) -> None:
        """Delete every object previously uploaded for this file's record id."""
        from weaviate.classes.query import Filter

        record_id = file_data.identifier
        collection = client.collections.get(self.upload_config.collection)
        delete_filter = Filter.by_property(name=self.upload_config.record_id_key).equal(
            val=record_id
        )
        # There is a configurable maximum limit (QUERY_MAXIMUM_RESULTS) on the number of
        # objects that can be deleted in a single query (default 10,000). To delete
        # more objects than the limit, re-run the query until nothing is deleted.
        while True:
            resp = collection.data.delete_many(where=delete_filter)
            if resp.failed:
                raise WriteError(
                    f"failed to delete records in collection "
                    f"{self.upload_config.collection} with record "
                    f"id property {record_id}"
                )
            # resp.failed is known falsy at this point, so the original's extra
            # `not resp.failed` re-check was redundant.
            if not resp.successful:
                break

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Delete stale entries for this record, then batch-upload the new objects."""
        logger.info(
            f"writing {len(data)} objects to destination "
            # Fix: the original interpolated connection_config.access_config here,
            # logging the (secret-wrapped) credentials object instead of the target.
            f"class {self.upload_config.collection} "
        )
        if not self.upload_config.collection:
            raise ValueError("No collection specified")

        with self.connection_config.get_client() as weaviate_client:
            # Make the write idempotent per source record: drop stale entries first.
            self.delete_by_record_id(client=weaviate_client, file_data=file_data)
            with self.upload_config.get_batch_client(client=weaviate_client) as batch_client:
                for element in data:
                    # Embeddings ride separately as the object's vector.
                    vector = element.pop("embeddings", None)
                    batch_client.add_object(
                        collection=self.upload_config.collection,
                        properties=element,
                        vector=vector,
                    )
            self.check_for_errors(client=weaviate_client)
|
|
File without changes
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Literal, Optional, Union
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field, HttpUrl
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.error import (
|
|
8
|
+
ProviderError,
|
|
9
|
+
RateLimitError,
|
|
10
|
+
UnstructuredIngestError,
|
|
11
|
+
UserAuthError,
|
|
12
|
+
UserError,
|
|
13
|
+
)
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
16
|
+
from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicode
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from httpx import AsyncClient, Client
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Attachment(BaseModel):
    """Minimal view of a Zendesk ticket attachment; only the MIME type is retained."""

    # https://developer.zendesk.com/api-reference/ticketing/tickets/ticket-attachments/#json-format
    content_type: Optional[str] = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Via(BaseModel):
    """Describes how a ticket or comment was created (channel plus source details)."""

    # https://developer.zendesk.com/documentation/ticketing/reference-guides/via-object-reference/
    # channel can arrive as a numeric id or a name string — TODO confirm which API
    # versions send which
    channel: Union[int, str]
    source: dict = Field(default_factory=dict)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ZendeskComment(BaseModel):
    """Pydantic model of a Zendesk ticket comment.

    Field names mirror the API payload; the JSON key "type" maps onto
    ``comment_type`` via a field alias.
    """

    # https://developer.zendesk.com/api-reference/ticketing/tickets/ticket_comments/#json-format
    attachments: list[Attachment] = Field(default_factory=list)
    audit_id: Optional[int] = None
    author_id: Optional[int] = None
    body: Optional[str] = None
    created_at: Optional[datetime] = None
    html_body: Optional[str] = None
    id: Optional[int] = None
    metadata: Optional[dict] = None
    plain_body: Optional[str] = None
    public: Optional[bool] = None
    # Required field; populated from the incoming JSON key "type".
    comment_type: Literal["Comment", "VoiceComment"] = Field(alias="type")
    uploads: list[str] = Field(default_factory=list)
    via: Optional[Via] = None

    def as_text(self) -> str:
        """Render the comment as plain text.

        Output is a "comment" header line followed by one "key: value" line for
        each of id/author_id/body/created_at that is set, each newline-terminated.
        """
        all_data = self.model_dump()
        # Keep only the identifying/content fields that actually have values.
        filtered_data = {
            k: v
            for k, v in all_data.items()
            if k in ["id", "author_id", "body", "created_at"] and v is not None
        }
        return "".join(
            [f"{v}\n" for v in ["comment"] + [f"{k}: {v}" for k, v in filtered_data.items()]]
        )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ZendeskTicket(BaseModel):
    """Pydantic model of a Zendesk ticket.

    Field names mirror the API payload; the JSON key "type" maps onto
    ``ticket_type`` via a field alias. ``from_messaging_channel`` and
    ``requester_id`` are the only required fields.
    """

    # https://developer.zendesk.com/api-reference/ticketing/tickets/tickets/#json-format
    allow_attachments: bool = True
    allow_channelback: bool = True
    assignee_email: Optional[str] = None
    assignee_id: Optional[int] = None
    attribute_value_ids: list[int] = Field(default_factory=list)
    brand_id: Optional[int] = None
    collaborator_ids: list[int] = Field(default_factory=list)
    collaborators: list[Union[int, str, dict[str, str]]] = Field(default_factory=list)
    comment: Optional[ZendeskComment] = None
    created_at: Optional[datetime] = None
    custom_fields: list[dict[str, Any]] = Field(default_factory=list)
    custom_status_id: Optional[int] = None
    description: Optional[str] = None
    due_at: Optional[datetime] = None
    email_cc_ids: list[int] = Field(default_factory=list)
    email_ccs: list[dict[str, str]] = Field(default_factory=list)
    external_id: Optional[str] = None
    follower_ids: list[int] = Field(default_factory=list)
    followers: list[dict[str, str]] = Field(default_factory=list)
    followup_ids: list[int] = Field(default_factory=list)
    forum_topic_id: Optional[int] = None
    from_messaging_channel: bool
    # NOTE(review): Zendesk documents generated_timestamp as a unix epoch integer;
    # pydantic coerces it to datetime — confirm that is the intended representation.
    generated_timestamp: Optional[datetime] = None
    group_id: Optional[int] = None
    has_incidents: bool = False
    id: Optional[int] = None
    is_public: bool = False
    macro_id: Optional[int] = None
    macro_ids: list[int] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
    organization_id: Optional[int] = None
    priority: Optional[Literal["urgent", "high", "normal", "low"]] = None
    problem_id: Optional[int] = None
    raw_subject: Optional[str] = None
    recipient: Optional[str] = None
    requester: dict[str, str] = Field(default_factory=dict)
    requester_id: int
    safe_update: Optional[bool] = None
    satisfaction_rating: Optional[Union[str, dict[str, Any]]] = None
    sharing_agreement_ids: list[int] = Field(default_factory=list)
    status: Optional[Literal["new", "open", "pending", "hold", "solved", "closed"]] = None
    subject: Optional[str] = None
    submitter_id: Optional[int] = None
    tags: list[str] = Field(default_factory=list)
    ticket_form_id: Optional[int] = None
    # Populated from the incoming JSON key "type".
    ticket_type: Optional[Literal["problem", "incident", "question", "task"]] = Field(
        default=None, alias="type"
    )
    updated_at: Optional[datetime] = None
    updated_stamp: Optional[str] = None
    url: Optional[HttpUrl] = None
    via: Optional[Via] = None
    via_followup_source_id: Optional[int] = None
    via_id: Optional[int] = None
    voice_comment: Optional[dict] = None

    def as_text(self) -> str:
        """Render the ticket as plain text.

        Output is a "ticket" header line followed by one "key: value" line for
        each of id/subject/description/created_at that is set, each
        newline-terminated.
        """
        all_data = self.model_dump()
        # Keep only the identifying/content fields that actually have values.
        filtered_data = {
            k: v
            for k, v in all_data.items()
            if k in ["id", "subject", "description", "created_at"] and v is not None
        }
        return "".join(
            [f"{v}\n" for v in ["ticket"] + [f"{k}: {v}" for k, v in filtered_data.items()]]
        )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class ZendeskArticle(BaseModel):
    """Pydantic model of a Zendesk Help Center article.

    ``id``, ``locale``, ``permission_group_id`` and ``title`` are required;
    everything else mirrors the optional API payload fields.
    """

    # https://developer.zendesk.com/api-reference/help_center/help-center-api/articles/#json-format
    author_id: Optional[int] = None
    body: Optional[str] = None
    comments_disabled: bool = False
    content_tag_ids: list[str] = Field(default_factory=list)
    created_at: Optional[datetime] = None
    draft: bool = False
    edited_at: Optional[datetime] = None
    html_url: Optional[HttpUrl] = None
    id: int
    label_names: list[str] = Field(default_factory=list)
    locale: str
    outdated: bool = False
    outdated_locales: list[str] = Field(default_factory=list)
    permission_group_id: int
    position: Optional[int] = None
    promoted: bool = False
    section_id: Optional[int] = None
    source_locale: Optional[str] = None
    title: str
    updated_at: Optional[datetime] = None
    url: Optional[HttpUrl] = None
    user_segment_id: Optional[int] = None
    user_segment_ids: list[int] = Field(default_factory=list)
    vote_count: Optional[int] = None
    vote_sum: Optional[int] = None

    def as_html(self) -> str:
        """Render the article as a minimal HTML document.

        The title (when non-empty) is prepended as an ``<h1>`` and the result is
        wrapped in a ``<body>`` tag, then passed through unicode-escape repair.
        A missing ``body`` is treated as empty — previously ``None`` was
        interpolated literally into the output.
        """
        html = self.body or ""
        if title := self.title:
            html = f"<h1>{title}</h1>{html}"
        return fix_unescaped_unicode(f"<body class='Document' >{html}</body>")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class ZendeskArticleAttachment(BaseModel):
    """Pydantic model of a Zendesk Help Center article attachment (e.g. an image)."""

    # https://developer.zendesk.com/api-reference/help_center/help-center-api/article_attachments/#json-format
    article_id: Optional[int] = None
    content_type: Optional[str] = None
    # URL from which the attachment bytes can be downloaded.
    content_url: Optional[HttpUrl] = None
    created_at: Optional[datetime] = None
    guide_media_id: Optional[str] = None
    id: Optional[int] = None
    inline: bool = False
    locale: Optional[str] = None
    size: Optional[int] = None
    updated_at: Optional[datetime] = None
    url: Optional[HttpUrl] = None
|
|
180
|
+
|
|
181
|
+
@dataclass
class ZendeskClient:
    """httpx-based client for the Zendesk ticketing and Help Center APIs.

    Authenticates with Zendesk email/API-token basic auth and exposes async
    generators over the cursor-paginated list endpoints. Construction performs a
    HEAD request against the account's groups endpoint to fail fast on bad
    credentials or subdomain.
    """

    token: str  # Zendesk API token
    subdomain: str  # e.g. "acme" for acme.zendesk.com
    email: str  # account email paired with the token
    max_page_size: int = 100  # page[size] requested per pagination call
    _async_client: "AsyncClient" = field(init=False, default=None)
    _client: "Client" = field(init=False, default=None)
    _base_url: str = field(init=False, default=None)

    async def __aenter__(self) -> "ZendeskClient":
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self._async_client.aclose()
        # Also close the sync client created in __post_init__; it was
        # previously never closed, leaking its connection pool.
        self._client.close()

    @requires_dependencies(["httpx"], extras="zendesk")
    def __post_init__(self):
        import httpx

        # Zendesk API-token auth: username is "<email>/token", password is the token.
        auth = f"{self.email}/token", self.token
        self._client = httpx.Client(auth=auth)
        self._async_client = httpx.AsyncClient(auth=auth)
        self._base_url = f"https://{self.subdomain}.zendesk.com/api/v2"

        # Run check: fail fast on bad credentials/subdomain.
        try:
            url_to_check = f"{self._base_url}/groups.json"
            resp = self._client.head(url_to_check)
            resp.raise_for_status()
        except Exception as e:
            raise self.wrap_error(e=e)

    @requires_dependencies(["httpx"], extras="zendesk")
    def wrap_error(self, e: Exception) -> Exception:
        """Map a raw client exception onto the ingest error hierarchy.

        Returns (does not raise) the mapped exception so callers can
        ``raise self.wrap_error(e=e)`` at the call site:
        401 -> UserAuthError, 429 -> RateLimitError, other 4xx -> UserError,
        5xx -> ProviderError, non-HTTP errors -> UnstructuredIngestError.
        """
        import httpx

        if not isinstance(e, httpx.HTTPStatusError):
            logger.error(f"unhandled exception from Zendesk client: {e}", exc_info=True)
            return UnstructuredIngestError(str(e))
        url = e.request.url
        response_code = e.response.status_code
        if response_code == 401:
            logger.error(
                f"Failed to connect via auth, "
                f"{url} using zendesk response, status code {response_code}"
            )
            return UserAuthError(e)
        if response_code == 429:
            logger.error(
                f"Failed to connect via rate limits "
                f"{url} using zendesk response, status code {response_code}"
            )
            return RateLimitError(e)
        if 400 <= response_code < 500:
            logger.error(
                f"Failed to connect to {url} using zendesk response, status code {response_code}"
            )
            return UserError(e)
        # Was `> 500`, which let HTTP 500 fall through as "unhandled" and be
        # returned unmapped instead of as a ProviderError.
        if response_code >= 500:
            logger.error(
                f"Failed to connect to {url} using zendesk response, status code {response_code}"
            )
            return ProviderError(e)
        logger.error(f"unhandled http status error from Zendesk client: {e}", exc_info=True)
        return e

    async def fetch_content(self, url: str, content_key: str) -> AsyncGenerator[dict, None]:
        """Yield items under `content_key` from a cursor-paginated endpoint.

        Follows `links.next` while `meta.has_more` is true, requesting
        `max_page_size` items per page.
        """
        url = f"{url}?page[size]={self.max_page_size}"
        while True:
            try:
                response = await self._async_client.get(url)
                response.raise_for_status()
            except Exception as e:
                raise self.wrap_error(e=e)

            data = response.json()
            for content in data[content_key]:
                yield content

            has_more = data.get("meta", {}).get("has_more", False)
            if not has_more:
                break

            url = data["links"]["next"]

    async def get_articles(self) -> AsyncGenerator[ZendeskArticle, None]:
        """
        Retrieves article content from Zendesk asynchronously.
        """
        article_url = f"https://{self.subdomain}.zendesk.com/api/v2/help_center/articles.json"

        try:
            async for article_dict in self.fetch_content(url=article_url, content_key="articles"):
                yield ZendeskArticle.model_validate(article_dict)
        except Exception as e:
            raise self.wrap_error(e=e)

    async def get_comments(self, ticket_id: int) -> AsyncGenerator[ZendeskComment, None]:
        """Yield all comments attached to the given ticket."""
        comments_url = f"https://{self.subdomain}.zendesk.com/api/v2/tickets/{ticket_id}/comments"

        try:
            async for comment_dict in self.fetch_content(url=comments_url, content_key="comments"):
                yield ZendeskComment.model_validate(comment_dict)
        except Exception as e:
            raise self.wrap_error(e=e)

    async def get_tickets(self) -> AsyncGenerator[ZendeskTicket, None]:
        """Yield all tickets in the account."""
        tickets_url = f"https://{self.subdomain}.zendesk.com/api/v2/tickets"

        try:
            async for ticket_dict in self.fetch_content(url=tickets_url, content_key="tickets"):
                yield ZendeskTicket.model_validate(ticket_dict)
        except Exception as e:
            raise self.wrap_error(e=e)

    async def get_article_attachments(
        self, article_id: int
    ) -> AsyncGenerator[ZendeskArticleAttachment, None]:
        """
        Handles article attachments such as images and stores them as UTF-8 encoded bytes.
        """
        article_attachment_url = (
            f"https://{self.subdomain}.zendesk.com/api/v2/help_center/"
            f"articles/{article_id}/attachments"
        )

        try:
            async for attachment_dict in self.fetch_content(
                url=article_attachment_url, content_key="article_attachments"
            ):
                yield ZendeskArticleAttachment.model_validate(attachment_dict)
        except Exception as e:
            raise self.wrap_error(e=e)
|