unstructured-ingest 0.5.15__py3-none-any.whl → 0.5.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_zendesk.py +31 -53
- test/integration/connectors/utils/validation/source.py +5 -3
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/pipeline/steps/download.py +3 -3
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +1 -1
- unstructured_ingest/v2/processes/connectors/zendesk/client.py +221 -156
- unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +83 -274
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.16.dist-info}/METADATA +19 -19
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.16.dist-info}/RECORD +14 -14
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.16.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.16.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.16.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.16.dist-info}/top_level.txt +0 -0
|
@@ -1,64 +1,210 @@
|
|
|
1
|
-
import
|
|
2
|
-
from
|
|
3
|
-
from typing import
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Literal, Optional, Union
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field, HttpUrl
|
|
4
6
|
|
|
5
7
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
8
|
+
from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicode
|
|
6
9
|
from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
|
|
7
10
|
from unstructured_ingest.v2.logger import logger
|
|
8
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from httpx import AsyncClient, Client
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Attachment(BaseModel):
|
|
17
|
+
# https://developer.zendesk.com/api-reference/ticketing/tickets/ticket-attachments/#json-format
|
|
18
|
+
content_type: Optional[str] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Via(BaseModel):
|
|
22
|
+
# https://developer.zendesk.com/documentation/ticketing/reference-guides/via-object-reference/
|
|
23
|
+
channel: Union[int, str]
|
|
24
|
+
source: dict = Field(default_factory=dict)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ZendeskComment(BaseModel):
|
|
28
|
+
# https://developer.zendesk.com/api-reference/ticketing/tickets/ticket_comments/#json-format
|
|
29
|
+
attachments: list[Attachment] = Field(default_factory=list)
|
|
30
|
+
audit_id: Optional[int] = None
|
|
31
|
+
author_id: Optional[int] = None
|
|
32
|
+
body: Optional[str] = None
|
|
33
|
+
created_at: Optional[datetime] = None
|
|
34
|
+
html_body: Optional[str] = None
|
|
35
|
+
id: Optional[int] = None
|
|
36
|
+
metadata: Optional[dict] = None
|
|
37
|
+
plain_body: Optional[str] = None
|
|
38
|
+
public: Optional[bool] = None
|
|
39
|
+
comment_type: Literal["Comment", "VoiceComment"] = Field(alias="type")
|
|
40
|
+
uploads: list[str] = Field(default_factory=list)
|
|
41
|
+
via: Optional[Via] = None
|
|
42
|
+
|
|
43
|
+
def as_text(self) -> str:
|
|
44
|
+
all_data = self.model_dump()
|
|
45
|
+
filtered_data = {
|
|
46
|
+
k: v
|
|
47
|
+
for k, v in all_data.items()
|
|
48
|
+
if k in ["id", "author_id", "body", "created_at"] and v is not None
|
|
49
|
+
}
|
|
50
|
+
return "".join(
|
|
51
|
+
[f"{v}\n" for v in ["comment"] + [f"{k}: {v}" for k, v in filtered_data.items()]]
|
|
52
|
+
)
|
|
9
53
|
|
|
10
|
-
@dataclass
|
|
11
|
-
class Comment:
|
|
12
|
-
id: int
|
|
13
|
-
author_id: str
|
|
14
|
-
body: str
|
|
15
|
-
parent_ticket_id: str
|
|
16
|
-
metadata: dict
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
|
|
20
|
-
class ZendeskTicket:
|
|
21
|
-
id: int
|
|
22
|
-
subject: str
|
|
23
|
-
description: str
|
|
24
|
-
generated_ts: int
|
|
25
|
-
metadata: dict
|
|
26
54
|
|
|
27
|
-
|
|
28
|
-
|
|
55
|
+
class ZendeskTicket(BaseModel):
|
|
56
|
+
# https://developer.zendesk.com/api-reference/ticketing/tickets/tickets/#json-format
|
|
57
|
+
allow_attachments: bool = True
|
|
58
|
+
allow_channelback: bool = True
|
|
59
|
+
assignee_email: Optional[str] = None
|
|
60
|
+
assignee_id: Optional[int] = None
|
|
61
|
+
attribute_value_ids: list[int] = Field(default_factory=list)
|
|
62
|
+
brand_id: Optional[int] = None
|
|
63
|
+
collaborator_ids: list[int] = Field(default_factory=list)
|
|
64
|
+
collaborators: list[Union[int, str, dict[str, str]]] = Field(default_factory=list)
|
|
65
|
+
comment: Optional[ZendeskComment] = None
|
|
66
|
+
created_at: Optional[datetime] = None
|
|
67
|
+
custom_fields: list[dict[str, Any]] = Field(default_factory=list)
|
|
68
|
+
custom_status_id: Optional[int] = None
|
|
69
|
+
description: Optional[str] = None
|
|
70
|
+
due_at: Optional[datetime] = None
|
|
71
|
+
email_cc_ids: list[int] = Field(default_factory=list)
|
|
72
|
+
email_ccs: list[dict[str, str]] = Field(default_factory=list)
|
|
73
|
+
external_id: Optional[str] = None
|
|
74
|
+
follower_ids: list[int] = Field(default_factory=list)
|
|
75
|
+
followers: list[dict[str, str]] = Field(default_factory=list)
|
|
76
|
+
followup_ids: list[int] = Field(default_factory=list)
|
|
77
|
+
forum_topic_id: Optional[int] = None
|
|
78
|
+
from_messaging_channel: bool
|
|
79
|
+
generated_timestamp: Optional[datetime] = None
|
|
80
|
+
group_id: Optional[int] = None
|
|
81
|
+
has_incidents: bool = False
|
|
82
|
+
id: Optional[int] = None
|
|
83
|
+
is_public: bool = False
|
|
84
|
+
macro_id: Optional[int] = None
|
|
85
|
+
macro_ids: list[int] = Field(default_factory=list)
|
|
86
|
+
metadata: dict[str, Any] = Field(default_factory=dict)
|
|
87
|
+
organization_id: Optional[int] = None
|
|
88
|
+
priority: Optional[Literal["urgent", "high", "normal", "low"]] = None
|
|
89
|
+
problem_id: Optional[int] = None
|
|
90
|
+
raw_subject: Optional[str] = None
|
|
91
|
+
recipient: Optional[str] = None
|
|
92
|
+
requester: dict[str, str] = Field(default_factory=dict)
|
|
93
|
+
requester_id: int
|
|
94
|
+
safe_update: Optional[bool] = None
|
|
95
|
+
satisfaction_rating: Optional[Union[str, dict[str, Any]]] = None
|
|
96
|
+
sharing_agreement_ids: list[int] = Field(default_factory=list)
|
|
97
|
+
status: Optional[Literal["new", "open", "pending", "hold", "solved", "closed"]] = None
|
|
98
|
+
subject: Optional[str] = None
|
|
99
|
+
submitter_id: Optional[int] = None
|
|
100
|
+
tags: list[str] = Field(default_factory=list)
|
|
101
|
+
ticket_form_id: Optional[int] = None
|
|
102
|
+
ticket_type: Optional[Literal["problem", "incident", "question", "task"]] = Field(
|
|
103
|
+
default=None, alias="type"
|
|
104
|
+
)
|
|
105
|
+
updated_at: Optional[datetime] = None
|
|
106
|
+
updated_stamp: Optional[str] = None
|
|
107
|
+
url: Optional[HttpUrl] = None
|
|
108
|
+
via: Optional[Via] = None
|
|
109
|
+
via_followup_source_id: Optional[int] = None
|
|
110
|
+
via_id: Optional[int] = None
|
|
111
|
+
voice_comment: Optional[dict] = None
|
|
112
|
+
|
|
113
|
+
def as_text(self) -> str:
|
|
114
|
+
all_data = self.model_dump()
|
|
115
|
+
filtered_data = {
|
|
116
|
+
k: v
|
|
117
|
+
for k, v in all_data.items()
|
|
118
|
+
if k in ["id", "subject", "description", "created_at"] and v is not None
|
|
119
|
+
}
|
|
120
|
+
return "".join(
|
|
121
|
+
[f"{v}\n" for v in ["ticket"] + [f"{k}: {v}" for k, v in filtered_data.items()]]
|
|
122
|
+
)
|
|
29
123
|
|
|
30
124
|
|
|
31
|
-
|
|
32
|
-
|
|
125
|
+
class ZendeskArticle(BaseModel):
|
|
126
|
+
# https://developer.zendesk.com/api-reference/help_center/help-center-api/articles/#json-format
|
|
127
|
+
author_id: Optional[int] = None
|
|
128
|
+
body: Optional[str] = None
|
|
129
|
+
comments_disabled: bool = False
|
|
130
|
+
content_tag_ids: list[str] = Field(default_factory=list)
|
|
131
|
+
created_at: Optional[datetime] = None
|
|
132
|
+
draft: bool = False
|
|
133
|
+
edited_at: Optional[datetime] = None
|
|
134
|
+
html_url: Optional[HttpUrl] = None
|
|
33
135
|
id: int
|
|
34
|
-
|
|
136
|
+
label_names: list[str] = Field(default_factory=list)
|
|
137
|
+
locale: str
|
|
138
|
+
outdated: bool = False
|
|
139
|
+
outdated_locales: list[str] = Field(default_factory=list)
|
|
140
|
+
permission_group_id: int
|
|
141
|
+
position: Optional[int] = None
|
|
142
|
+
promoted: bool = False
|
|
143
|
+
section_id: Optional[int] = None
|
|
144
|
+
source_locale: Optional[str] = None
|
|
35
145
|
title: str
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
146
|
+
updated_at: Optional[datetime] = None
|
|
147
|
+
url: Optional[HttpUrl] = None
|
|
148
|
+
user_segment_id: Optional[int] = None
|
|
149
|
+
user_segment_ids: list[int] = Field(default_factory=list)
|
|
150
|
+
vote_count: Optional[int] = None
|
|
151
|
+
vote_sum: Optional[int] = None
|
|
152
|
+
|
|
153
|
+
def as_html(self) -> str:
|
|
154
|
+
html = self.body
|
|
155
|
+
if title := self.title:
|
|
156
|
+
html = f"<h1>{title}</h1>{html}"
|
|
157
|
+
return fix_unescaped_unicode(f"<body class='Document' >{html}</body>")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class ZendeskArticleAttachment(BaseModel):
|
|
161
|
+
# https://developer.zendesk.com/api-reference/help_center/help-center-api/article_attachments/#json-format
|
|
162
|
+
article_id: Optional[int] = None
|
|
163
|
+
content_type: Optional[str] = None
|
|
164
|
+
content_url: Optional[HttpUrl] = None
|
|
165
|
+
created_at: Optional[datetime] = None
|
|
166
|
+
guide_media_id: Optional[str] = None
|
|
167
|
+
id: Optional[int] = None
|
|
168
|
+
inline: bool = False
|
|
169
|
+
locale: Optional[str] = None
|
|
170
|
+
size: Optional[int] = None
|
|
171
|
+
updated_at: Optional[datetime] = None
|
|
172
|
+
url: Optional[HttpUrl] = None
|
|
40
173
|
|
|
41
174
|
|
|
175
|
+
@dataclass
|
|
42
176
|
class ZendeskClient:
|
|
177
|
+
token: str
|
|
178
|
+
subdomain: str
|
|
179
|
+
email: str
|
|
180
|
+
max_page_size: int = 100
|
|
181
|
+
_async_client: "AsyncClient" = field(init=False, default=None)
|
|
182
|
+
_client: "Client" = field(init=False, default=None)
|
|
183
|
+
_base_url: str = field(init=False, default=None)
|
|
184
|
+
|
|
185
|
+
async def __aenter__(self) -> "ZendeskClient":
|
|
186
|
+
return self
|
|
187
|
+
|
|
188
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
189
|
+
await self._async_client.aclose()
|
|
43
190
|
|
|
44
191
|
@requires_dependencies(["httpx"], extras="zendesk")
|
|
45
|
-
def
|
|
192
|
+
def __post_init__(self):
|
|
46
193
|
import httpx
|
|
47
194
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
195
|
+
auth = f"{self.email}/token", self.token
|
|
196
|
+
self._client = httpx.Client(auth=auth)
|
|
197
|
+
self._async_client = httpx.AsyncClient(auth=auth)
|
|
198
|
+
self._base_url = f"https://{self.subdomain}.zendesk.com/api/v2"
|
|
51
199
|
|
|
200
|
+
# Run check
|
|
52
201
|
try:
|
|
53
|
-
|
|
202
|
+
url_to_check = f"{self._base_url}/groups.json"
|
|
203
|
+
resp = self._client.head(url_to_check)
|
|
204
|
+
resp.raise_for_status()
|
|
54
205
|
except Exception as e:
|
|
55
206
|
raise self.wrap_error(e=e)
|
|
56
207
|
|
|
57
|
-
self._token = token
|
|
58
|
-
self._subdomain = subdomain
|
|
59
|
-
self._email = email
|
|
60
|
-
self._auth = auth
|
|
61
|
-
|
|
62
208
|
@requires_dependencies(["httpx"], extras="zendesk")
|
|
63
209
|
def wrap_error(self, e: Exception) -> Exception:
|
|
64
210
|
import httpx
|
|
@@ -93,151 +239,70 @@ class ZendeskClient:
|
|
|
93
239
|
logger.error(f"unhandled http status error from Zendesk client: {e}", exc_info=True)
|
|
94
240
|
return e
|
|
95
241
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
import httpx
|
|
102
|
-
|
|
103
|
-
articles: List[ZendeskArticle] = []
|
|
104
|
-
|
|
105
|
-
article_url = f"https://{self._subdomain}.zendesk.com/api/v2/help_center/articles.json"
|
|
106
|
-
|
|
107
|
-
try:
|
|
108
|
-
async with httpx.AsyncClient() as client:
|
|
109
|
-
response = await client.get(article_url, auth=self._auth)
|
|
242
|
+
async def fetch_content(self, url: str, content_key: str) -> AsyncGenerator[dict, None]:
|
|
243
|
+
url = f"{url}?page[size]={self.max_page_size}"
|
|
244
|
+
while True:
|
|
245
|
+
try:
|
|
246
|
+
response = await self._async_client.get(url)
|
|
110
247
|
response.raise_for_status()
|
|
111
|
-
|
|
112
|
-
|
|
248
|
+
except Exception as e:
|
|
249
|
+
raise self.wrap_error(e=e)
|
|
113
250
|
|
|
114
|
-
|
|
251
|
+
data = response.json()
|
|
252
|
+
for content in data[content_key]:
|
|
253
|
+
yield content
|
|
115
254
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
author_id=str(entry["author_id"]),
|
|
120
|
-
title=str(entry["title"]),
|
|
121
|
-
content=entry["body"],
|
|
122
|
-
)
|
|
123
|
-
for entry in articles_in_response
|
|
124
|
-
]
|
|
125
|
-
return articles
|
|
255
|
+
has_more = data.get("meta", {}).get("has_more", False)
|
|
256
|
+
if not has_more:
|
|
257
|
+
break
|
|
126
258
|
|
|
127
|
-
|
|
128
|
-
async def get_comments_async(self, ticket_id: int) -> List["Comment"]:
|
|
129
|
-
import httpx
|
|
259
|
+
url = data["links"]["next"]
|
|
130
260
|
|
|
131
|
-
|
|
261
|
+
async def get_articles(self) -> AsyncGenerator[ZendeskArticle, None]:
|
|
262
|
+
"""
|
|
263
|
+
Retrieves article content from Zendesk asynchronously.
|
|
264
|
+
"""
|
|
265
|
+
article_url = f"https://{self.subdomain}.zendesk.com/api/v2/help_center/articles.json"
|
|
132
266
|
|
|
133
267
|
try:
|
|
134
|
-
async
|
|
135
|
-
|
|
136
|
-
response.raise_for_status()
|
|
268
|
+
async for article_dict in self.fetch_content(url=article_url, content_key="articles"):
|
|
269
|
+
yield ZendeskArticle.model_validate(article_dict)
|
|
137
270
|
except Exception as e:
|
|
138
271
|
raise self.wrap_error(e=e)
|
|
139
272
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
id=int(entry["id"]),
|
|
143
|
-
author_id=entry["author_id"],
|
|
144
|
-
body=entry["body"],
|
|
145
|
-
metadata=entry,
|
|
146
|
-
parent_ticket_id=ticket_id,
|
|
147
|
-
)
|
|
148
|
-
for entry in response.json()["comments"]
|
|
149
|
-
]
|
|
150
|
-
|
|
151
|
-
@requires_dependencies(["httpx"], extras="zendesk")
|
|
152
|
-
def get_users(self) -> List[dict]:
|
|
153
|
-
import httpx
|
|
154
|
-
|
|
155
|
-
users: List[dict] = []
|
|
273
|
+
async def get_comments(self, ticket_id: int) -> AsyncGenerator[ZendeskComment, None]:
|
|
274
|
+
comments_url = f"https://{self.subdomain}.zendesk.com/api/v2/tickets/{ticket_id}/comments"
|
|
156
275
|
|
|
157
|
-
users_url = f"https://{self._subdomain}.zendesk.com/api/v2/users"
|
|
158
276
|
try:
|
|
159
|
-
|
|
160
|
-
|
|
277
|
+
async for comment_dict in self.fetch_content(url=comments_url, content_key="comments"):
|
|
278
|
+
yield ZendeskComment.model_validate(comment_dict)
|
|
161
279
|
except Exception as e:
|
|
162
280
|
raise self.wrap_error(e=e)
|
|
163
281
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
return users
|
|
168
|
-
|
|
169
|
-
@requires_dependencies(["httpx"], extras="zendesk")
|
|
170
|
-
async def get_tickets_async(self) -> List["ZendeskTicket"]:
|
|
171
|
-
import httpx
|
|
172
|
-
|
|
173
|
-
tickets: List["ZendeskTicket"] = []
|
|
174
|
-
tickets_url = f"https://{self._subdomain}.zendesk.com/api/v2/tickets"
|
|
282
|
+
async def get_tickets(self) -> AsyncGenerator[ZendeskTicket, None]:
|
|
283
|
+
tickets_url = f"https://{self.subdomain}.zendesk.com/api/v2/tickets"
|
|
175
284
|
|
|
176
285
|
try:
|
|
177
|
-
async
|
|
178
|
-
|
|
179
|
-
response.raise_for_status()
|
|
286
|
+
async for ticket_dict in self.fetch_content(url=tickets_url, content_key="tickets"):
|
|
287
|
+
yield ZendeskTicket.model_validate(ticket_dict)
|
|
180
288
|
except Exception as e:
|
|
181
289
|
raise self.wrap_error(e=e)
|
|
182
290
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
ticket = ZendeskTicket(
|
|
187
|
-
id=int(entry["id"]),
|
|
188
|
-
subject=entry["subject"],
|
|
189
|
-
description=entry["description"],
|
|
190
|
-
generated_ts=entry["generated_timestamp"],
|
|
191
|
-
metadata=entry,
|
|
192
|
-
)
|
|
193
|
-
tickets.append(ticket)
|
|
194
|
-
|
|
195
|
-
return tickets
|
|
196
|
-
|
|
197
|
-
@requires_dependencies(["httpx"], extras="zendesk")
|
|
198
|
-
async def get_article_attachments_async(self, article_id: str):
|
|
291
|
+
async def get_article_attachments(
|
|
292
|
+
self, article_id: int
|
|
293
|
+
) -> AsyncGenerator[ZendeskArticleAttachment, None]:
|
|
199
294
|
"""
|
|
200
295
|
Handles article attachments such as images and stores them as UTF-8 encoded bytes.
|
|
201
296
|
"""
|
|
202
|
-
import httpx
|
|
203
|
-
|
|
204
297
|
article_attachment_url = (
|
|
205
|
-
f"https://{self.
|
|
298
|
+
f"https://{self.subdomain}.zendesk.com/api/v2/help_center/"
|
|
206
299
|
f"articles/{article_id}/attachments"
|
|
207
300
|
)
|
|
208
301
|
|
|
209
302
|
try:
|
|
210
|
-
async
|
|
211
|
-
|
|
212
|
-
|
|
303
|
+
async for attachment_dict in self.fetch_content(
|
|
304
|
+
url=article_attachment_url, content_key="article_attachments"
|
|
305
|
+
):
|
|
306
|
+
yield ZendeskArticleAttachment.model_validate(attachment_dict)
|
|
213
307
|
except Exception as e:
|
|
214
308
|
raise self.wrap_error(e=e)
|
|
215
|
-
|
|
216
|
-
attachments_in_response: List[Dict] = response.json().get("article_attachments", [])
|
|
217
|
-
attachments = []
|
|
218
|
-
|
|
219
|
-
for attachment in attachments_in_response:
|
|
220
|
-
attachment_data = {
|
|
221
|
-
"id": attachment["id"],
|
|
222
|
-
"file_name": attachment["file_name"],
|
|
223
|
-
"content_type": attachment["content_type"],
|
|
224
|
-
"size": attachment["size"],
|
|
225
|
-
"url": attachment["url"],
|
|
226
|
-
"content_url": attachment["content_url"],
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
try:
|
|
230
|
-
async with httpx.AsyncClient() as client:
|
|
231
|
-
download_response = await client.get(attachment["content_url"], auth=self._auth)
|
|
232
|
-
download_response.raise_for_status()
|
|
233
|
-
except Exception as e:
|
|
234
|
-
raise self.wrap_error(e=e)
|
|
235
|
-
|
|
236
|
-
encoded_content = base64.b64encode(download_response.content).decode("utf-8")
|
|
237
|
-
attachment_data["encoded_content"] = (
|
|
238
|
-
f"data:{attachment_data['content_type']};base64,{encoded_content}"
|
|
239
|
-
)
|
|
240
|
-
|
|
241
|
-
attachments.append(attachment_data)
|
|
242
|
-
|
|
243
|
-
return attachments
|