unstructured-ingest 0.5.15__py3-none-any.whl → 0.5.17__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1,64 +1,210 @@
1
- import base64
2
- from dataclasses import dataclass
3
- from typing import Dict, List
1
+ from dataclasses import dataclass, field
2
+ from datetime import datetime
3
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Literal, Optional, Union
4
+
5
+ from pydantic import BaseModel, Field, HttpUrl
4
6
 
5
7
  from unstructured_ingest.utils.dep_check import requires_dependencies
8
+ from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicode
6
9
  from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
7
10
  from unstructured_ingest.v2.logger import logger
8
11
 
12
+ if TYPE_CHECKING:
13
+ from httpx import AsyncClient, Client
14
+
15
+
16
+ class Attachment(BaseModel):
17
+ # https://developer.zendesk.com/api-reference/ticketing/tickets/ticket-attachments/#json-format
18
+ content_type: Optional[str] = None
19
+
20
+
21
+ class Via(BaseModel):
22
+ # https://developer.zendesk.com/documentation/ticketing/reference-guides/via-object-reference/
23
+ channel: Union[int, str]
24
+ source: dict = Field(default_factory=dict)
25
+
26
+
27
+ class ZendeskComment(BaseModel):
28
+ # https://developer.zendesk.com/api-reference/ticketing/tickets/ticket_comments/#json-format
29
+ attachments: list[Attachment] = Field(default_factory=list)
30
+ audit_id: Optional[int] = None
31
+ author_id: Optional[int] = None
32
+ body: Optional[str] = None
33
+ created_at: Optional[datetime] = None
34
+ html_body: Optional[str] = None
35
+ id: Optional[int] = None
36
+ metadata: Optional[dict] = None
37
+ plain_body: Optional[str] = None
38
+ public: Optional[bool] = None
39
+ comment_type: Literal["Comment", "VoiceComment"] = Field(alias="type")
40
+ uploads: list[str] = Field(default_factory=list)
41
+ via: Optional[Via] = None
42
+
43
+ def as_text(self) -> str:
44
+ all_data = self.model_dump()
45
+ filtered_data = {
46
+ k: v
47
+ for k, v in all_data.items()
48
+ if k in ["id", "author_id", "body", "created_at"] and v is not None
49
+ }
50
+ return "".join(
51
+ [f"{v}\n" for v in ["comment"] + [f"{k}: {v}" for k, v in filtered_data.items()]]
52
+ )
9
53
 
10
- @dataclass
11
- class Comment:
12
- id: int
13
- author_id: str
14
- body: str
15
- parent_ticket_id: str
16
- metadata: dict
17
-
18
-
19
- @dataclass
20
- class ZendeskTicket:
21
- id: int
22
- subject: str
23
- description: str
24
- generated_ts: int
25
- metadata: dict
26
54
 
27
- def __lt__(self, other):
28
- return int(self.id) < int(other.id)
55
+ class ZendeskTicket(BaseModel):
56
+ # https://developer.zendesk.com/api-reference/ticketing/tickets/tickets/#json-format
57
+ allow_attachments: bool = True
58
+ allow_channelback: bool = True
59
+ assignee_email: Optional[str] = None
60
+ assignee_id: Optional[int] = None
61
+ attribute_value_ids: list[int] = Field(default_factory=list)
62
+ brand_id: Optional[int] = None
63
+ collaborator_ids: list[int] = Field(default_factory=list)
64
+ collaborators: list[Union[int, str, dict[str, str]]] = Field(default_factory=list)
65
+ comment: Optional[ZendeskComment] = None
66
+ created_at: Optional[datetime] = None
67
+ custom_fields: list[dict[str, Any]] = Field(default_factory=list)
68
+ custom_status_id: Optional[int] = None
69
+ description: Optional[str] = None
70
+ due_at: Optional[datetime] = None
71
+ email_cc_ids: list[int] = Field(default_factory=list)
72
+ email_ccs: list[dict[str, str]] = Field(default_factory=list)
73
+ external_id: Optional[str] = None
74
+ follower_ids: list[int] = Field(default_factory=list)
75
+ followers: list[dict[str, str]] = Field(default_factory=list)
76
+ followup_ids: list[int] = Field(default_factory=list)
77
+ forum_topic_id: Optional[int] = None
78
+ from_messaging_channel: bool
79
+ generated_timestamp: Optional[datetime] = None
80
+ group_id: Optional[int] = None
81
+ has_incidents: bool = False
82
+ id: Optional[int] = None
83
+ is_public: bool = False
84
+ macro_id: Optional[int] = None
85
+ macro_ids: list[int] = Field(default_factory=list)
86
+ metadata: dict[str, Any] = Field(default_factory=dict)
87
+ organization_id: Optional[int] = None
88
+ priority: Optional[Literal["urgent", "high", "normal", "low"]] = None
89
+ problem_id: Optional[int] = None
90
+ raw_subject: Optional[str] = None
91
+ recipient: Optional[str] = None
92
+ requester: dict[str, str] = Field(default_factory=dict)
93
+ requester_id: int
94
+ safe_update: Optional[bool] = None
95
+ satisfaction_rating: Optional[Union[str, dict[str, Any]]] = None
96
+ sharing_agreement_ids: list[int] = Field(default_factory=list)
97
+ status: Optional[Literal["new", "open", "pending", "hold", "solved", "closed"]] = None
98
+ subject: Optional[str] = None
99
+ submitter_id: Optional[int] = None
100
+ tags: list[str] = Field(default_factory=list)
101
+ ticket_form_id: Optional[int] = None
102
+ ticket_type: Optional[Literal["problem", "incident", "question", "task"]] = Field(
103
+ default=None, alias="type"
104
+ )
105
+ updated_at: Optional[datetime] = None
106
+ updated_stamp: Optional[str] = None
107
+ url: Optional[HttpUrl] = None
108
+ via: Optional[Via] = None
109
+ via_followup_source_id: Optional[int] = None
110
+ via_id: Optional[int] = None
111
+ voice_comment: Optional[dict] = None
112
+
113
+ def as_text(self) -> str:
114
+ all_data = self.model_dump()
115
+ filtered_data = {
116
+ k: v
117
+ for k, v in all_data.items()
118
+ if k in ["id", "subject", "description", "created_at"] and v is not None
119
+ }
120
+ return "".join(
121
+ [f"{v}\n" for v in ["ticket"] + [f"{k}: {v}" for k, v in filtered_data.items()]]
122
+ )
29
123
 
30
124
 
31
- @dataclass
32
- class ZendeskArticle:
125
+ class ZendeskArticle(BaseModel):
126
+ # https://developer.zendesk.com/api-reference/help_center/help-center-api/articles/#json-format
127
+ author_id: Optional[int] = None
128
+ body: Optional[str] = None
129
+ comments_disabled: bool = False
130
+ content_tag_ids: list[str] = Field(default_factory=list)
131
+ created_at: Optional[datetime] = None
132
+ draft: bool = False
133
+ edited_at: Optional[datetime] = None
134
+ html_url: Optional[HttpUrl] = None
33
135
  id: int
34
- author_id: str
136
+ label_names: list[str] = Field(default_factory=list)
137
+ locale: str
138
+ outdated: bool = False
139
+ outdated_locales: list[str] = Field(default_factory=list)
140
+ permission_group_id: int
141
+ position: Optional[int] = None
142
+ promoted: bool = False
143
+ section_id: Optional[int] = None
144
+ source_locale: Optional[str] = None
35
145
  title: str
36
- content: str
37
-
38
- def __lt__(self, other):
39
- return int(self.id) < int(other.id)
146
+ updated_at: Optional[datetime] = None
147
+ url: Optional[HttpUrl] = None
148
+ user_segment_id: Optional[int] = None
149
+ user_segment_ids: list[int] = Field(default_factory=list)
150
+ vote_count: Optional[int] = None
151
+ vote_sum: Optional[int] = None
152
+
153
+ def as_html(self) -> str:
154
+ html = self.body
155
+ if title := self.title:
156
+ html = f"<h1>{title}</h1>{html}"
157
+ return fix_unescaped_unicode(f"<body class='Document' >{html}</body>")
158
+
159
+
160
+ class ZendeskArticleAttachment(BaseModel):
161
+ # https://developer.zendesk.com/api-reference/help_center/help-center-api/article_attachments/#json-format
162
+ article_id: Optional[int] = None
163
+ content_type: Optional[str] = None
164
+ content_url: Optional[HttpUrl] = None
165
+ created_at: Optional[datetime] = None
166
+ guide_media_id: Optional[str] = None
167
+ id: Optional[int] = None
168
+ inline: bool = False
169
+ locale: Optional[str] = None
170
+ size: Optional[int] = None
171
+ updated_at: Optional[datetime] = None
172
+ url: Optional[HttpUrl] = None
40
173
 
41
174
 
175
+ @dataclass
42
176
  class ZendeskClient:
177
+ token: str
178
+ subdomain: str
179
+ email: str
180
+ max_page_size: int = 100
181
+ _async_client: "AsyncClient" = field(init=False, default=None)
182
+ _client: "Client" = field(init=False, default=None)
183
+ _base_url: str = field(init=False, default=None)
184
+
185
+ async def __aenter__(self) -> "ZendeskClient":
186
+ return self
187
+
188
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
189
+ await self._async_client.aclose()
43
190
 
44
191
  @requires_dependencies(["httpx"], extras="zendesk")
45
- def __init__(self, token: str, subdomain: str, email: str):
192
+ def __post_init__(self):
46
193
  import httpx
47
194
 
48
- # should be okay to be blocking.
49
- url_to_check = f"https://{subdomain}.zendesk.com/api/v2/groups.json"
50
- auth = f"{email}/token", token
195
+ auth = f"{self.email}/token", self.token
196
+ self._client = httpx.Client(auth=auth)
197
+ self._async_client = httpx.AsyncClient(auth=auth)
198
+ self._base_url = f"https://{self.subdomain}.zendesk.com/api/v2"
51
199
 
200
+ # Run check
52
201
  try:
53
- _ = httpx.get(url_to_check, auth=auth)
202
+ url_to_check = f"{self._base_url}/groups.json"
203
+ resp = self._client.head(url_to_check)
204
+ resp.raise_for_status()
54
205
  except Exception as e:
55
206
  raise self.wrap_error(e=e)
56
207
 
57
- self._token = token
58
- self._subdomain = subdomain
59
- self._email = email
60
- self._auth = auth
61
-
62
208
  @requires_dependencies(["httpx"], extras="zendesk")
63
209
  def wrap_error(self, e: Exception) -> Exception:
64
210
  import httpx
@@ -93,151 +239,70 @@ class ZendeskClient:
93
239
  logger.error(f"unhandled http status error from Zendesk client: {e}", exc_info=True)
94
240
  return e
95
241
 
96
- @requires_dependencies(["httpx"], extras="zendesk")
97
- async def get_articles_async(self) -> List[ZendeskArticle]:
98
- """
99
- Retrieves article content from Zendesk asynchronously.
100
- """
101
- import httpx
102
-
103
- articles: List[ZendeskArticle] = []
104
-
105
- article_url = f"https://{self._subdomain}.zendesk.com/api/v2/help_center/articles.json"
106
-
107
- try:
108
- async with httpx.AsyncClient() as client:
109
- response = await client.get(article_url, auth=self._auth)
242
+ async def fetch_content(self, url: str, content_key: str) -> AsyncGenerator[dict, None]:
243
+ url = f"{url}?page[size]={self.max_page_size}"
244
+ while True:
245
+ try:
246
+ response = await self._async_client.get(url)
110
247
  response.raise_for_status()
111
- except Exception as e:
112
- raise self.wrap_error(e=e)
248
+ except Exception as e:
249
+ raise self.wrap_error(e=e)
113
250
 
114
- articles_in_response: List[dict] = response.json()["articles"]
251
+ data = response.json()
252
+ for content in data[content_key]:
253
+ yield content
115
254
 
116
- articles = [
117
- ZendeskArticle(
118
- id=int(entry["id"]),
119
- author_id=str(entry["author_id"]),
120
- title=str(entry["title"]),
121
- content=entry["body"],
122
- )
123
- for entry in articles_in_response
124
- ]
125
- return articles
255
+ has_more = data.get("meta", {}).get("has_more", False)
256
+ if not has_more:
257
+ break
126
258
 
127
- @requires_dependencies(["httpx"], extras="zendesk")
128
- async def get_comments_async(self, ticket_id: int) -> List["Comment"]:
129
- import httpx
259
+ url = data["links"]["next"]
130
260
 
131
- comments_url = f"https://{self._subdomain}.zendesk.com/api/v2/tickets/{ticket_id}/comments"
261
+ async def get_articles(self) -> AsyncGenerator[ZendeskArticle, None]:
262
+ """
263
+ Retrieves article content from Zendesk asynchronously.
264
+ """
265
+ article_url = f"https://{self.subdomain}.zendesk.com/api/v2/help_center/articles.json"
132
266
 
133
267
  try:
134
- async with httpx.AsyncClient() as client:
135
- response = await client.get(comments_url, auth=self._auth)
136
- response.raise_for_status()
268
+ async for article_dict in self.fetch_content(url=article_url, content_key="articles"):
269
+ yield ZendeskArticle.model_validate(article_dict)
137
270
  except Exception as e:
138
271
  raise self.wrap_error(e=e)
139
272
 
140
- return [
141
- Comment(
142
- id=int(entry["id"]),
143
- author_id=entry["author_id"],
144
- body=entry["body"],
145
- metadata=entry,
146
- parent_ticket_id=ticket_id,
147
- )
148
- for entry in response.json()["comments"]
149
- ]
150
-
151
- @requires_dependencies(["httpx"], extras="zendesk")
152
- def get_users(self) -> List[dict]:
153
- import httpx
154
-
155
- users: List[dict] = []
273
+ async def get_comments(self, ticket_id: int) -> AsyncGenerator[ZendeskComment, None]:
274
+ comments_url = f"https://{self.subdomain}.zendesk.com/api/v2/tickets/{ticket_id}/comments"
156
275
 
157
- users_url = f"https://{self._subdomain}.zendesk.com/api/v2/users"
158
276
  try:
159
- response = httpx.get(users_url, auth=self._auth)
160
- response.raise_for_status()
277
+ async for comment_dict in self.fetch_content(url=comments_url, content_key="comments"):
278
+ yield ZendeskComment.model_validate(comment_dict)
161
279
  except Exception as e:
162
280
  raise self.wrap_error(e=e)
163
281
 
164
- users_in_response: List[dict] = response.json()["users"]
165
- users = users_in_response
166
-
167
- return users
168
-
169
- @requires_dependencies(["httpx"], extras="zendesk")
170
- async def get_tickets_async(self) -> List["ZendeskTicket"]:
171
- import httpx
172
-
173
- tickets: List["ZendeskTicket"] = []
174
- tickets_url = f"https://{self._subdomain}.zendesk.com/api/v2/tickets"
282
+ async def get_tickets(self) -> AsyncGenerator[ZendeskTicket, None]:
283
+ tickets_url = f"https://{self.subdomain}.zendesk.com/api/v2/tickets"
175
284
 
176
285
  try:
177
- async with httpx.AsyncClient() as client:
178
- response = await client.get(tickets_url, auth=self._auth)
179
- response.raise_for_status()
286
+ async for ticket_dict in self.fetch_content(url=tickets_url, content_key="tickets"):
287
+ yield ZendeskTicket.model_validate(ticket_dict)
180
288
  except Exception as e:
181
289
  raise self.wrap_error(e=e)
182
290
 
183
- tickets_in_response: List[dict] = response.json()["tickets"]
184
-
185
- for entry in tickets_in_response:
186
- ticket = ZendeskTicket(
187
- id=int(entry["id"]),
188
- subject=entry["subject"],
189
- description=entry["description"],
190
- generated_ts=entry["generated_timestamp"],
191
- metadata=entry,
192
- )
193
- tickets.append(ticket)
194
-
195
- return tickets
196
-
197
- @requires_dependencies(["httpx"], extras="zendesk")
198
- async def get_article_attachments_async(self, article_id: str):
291
+ async def get_article_attachments(
292
+ self, article_id: int
293
+ ) -> AsyncGenerator[ZendeskArticleAttachment, None]:
199
294
  """
200
295
  Handles article attachments such as images and stores them as UTF-8 encoded bytes.
201
296
  """
202
- import httpx
203
-
204
297
  article_attachment_url = (
205
- f"https://{self._subdomain}.zendesk.com/api/v2/help_center/"
298
+ f"https://{self.subdomain}.zendesk.com/api/v2/help_center/"
206
299
  f"articles/{article_id}/attachments"
207
300
  )
208
301
 
209
302
  try:
210
- async with httpx.AsyncClient() as client:
211
- response = await client.get(article_attachment_url, auth=self._auth)
212
- response.raise_for_status()
303
+ async for attachment_dict in self.fetch_content(
304
+ url=article_attachment_url, content_key="article_attachments"
305
+ ):
306
+ yield ZendeskArticleAttachment.model_validate(attachment_dict)
213
307
  except Exception as e:
214
308
  raise self.wrap_error(e=e)
215
-
216
- attachments_in_response: List[Dict] = response.json().get("article_attachments", [])
217
- attachments = []
218
-
219
- for attachment in attachments_in_response:
220
- attachment_data = {
221
- "id": attachment["id"],
222
- "file_name": attachment["file_name"],
223
- "content_type": attachment["content_type"],
224
- "size": attachment["size"],
225
- "url": attachment["url"],
226
- "content_url": attachment["content_url"],
227
- }
228
-
229
- try:
230
- async with httpx.AsyncClient() as client:
231
- download_response = await client.get(attachment["content_url"], auth=self._auth)
232
- download_response.raise_for_status()
233
- except Exception as e:
234
- raise self.wrap_error(e=e)
235
-
236
- encoded_content = base64.b64encode(download_response.content).decode("utf-8")
237
- attachment_data["encoded_content"] = (
238
- f"data:{attachment_data['content_type']};base64,{encoded_content}"
239
- )
240
-
241
- attachments.append(attachment_data)
242
-
243
- return attachments