unstructured-ingest 0.5.9__py3-none-any.whl → 0.5.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (28)
  1. test/integration/connectors/test_astradb.py +21 -0
  2. test/integration/connectors/test_dropbox.py +151 -0
  3. test/integration/connectors/test_jira.py +67 -0
  4. test/integration/connectors/test_zendesk.py +142 -0
  5. test/integration/connectors/utils/validation/destination.py +2 -1
  6. test/unit/test_utils.py +27 -0
  7. test/unit/v2/connectors/test_jira.py +401 -0
  8. unstructured_ingest/__version__.py +1 -1
  9. unstructured_ingest/embed/openai.py +4 -3
  10. unstructured_ingest/utils/string_and_date_utils.py +25 -0
  11. unstructured_ingest/v2/interfaces/downloader.py +2 -3
  12. unstructured_ingest/v2/processes/connectors/__init__.py +4 -0
  13. unstructured_ingest/v2/processes/connectors/astradb.py +36 -28
  14. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  15. unstructured_ingest/v2/processes/connectors/delta_table.py +2 -0
  16. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +78 -15
  17. unstructured_ingest/v2/processes/connectors/jira.py +453 -0
  18. unstructured_ingest/v2/processes/connectors/zendesk/__init__.py +31 -0
  19. unstructured_ingest/v2/processes/connectors/zendesk/client.py +225 -0
  20. unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +419 -0
  21. unstructured_ingest/v2/processes/partitioner.py +2 -5
  22. unstructured_ingest/v2/unstructured_api.py +7 -0
  23. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/METADATA +26 -26
  24. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/RECORD +28 -20
  25. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/LICENSE.md +0 -0
  26. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/WHEEL +0 -0
  27. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/entry_points.txt +0 -0
  28. {unstructured_ingest-0.5.9.dist-info → unstructured_ingest-0.5.11.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/zendesk/client.py
@@ -0,0 +1,225 @@
+ import base64
+ from dataclasses import dataclass
+ from typing import Dict, List
+
+ import httpx
+
+ from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
+ from unstructured_ingest.v2.logger import logger
+
+
+ @dataclass
+ class Comment:
+     id: int
+     author_id: str
+     body: str
+     parent_ticket_id: str
+     metadata: dict
+
+
+ @dataclass
+ class ZendeskTicket:
+     id: int
+     subject: str
+     description: str
+     generated_ts: int
+     metadata: dict
+
+     def __lt__(self, other):
+         return int(self.id) < int(other.id)
+
+
+ @dataclass
+ class ZendeskArticle:
+     id: int
+     author_id: str
+     title: str
+     content: str
+
+     def __lt__(self, other):
+         return int(self.id) < int(other.id)
+
+
+ class ZendeskClient:
+
+     def __init__(self, token: str, subdomain: str, email: str):
+         # should be okay to be blocking.
+         url_to_check = f"https://{subdomain}.zendesk.com/api/v2/groups.json"
+         auth = f"{email}/token", token
+
+         try:
+             _ = httpx.get(url_to_check, auth=auth)
+         except Exception as e:
+             raise self.wrap_error(e=e)
+
+         self._token = token
+         self._subdomain = subdomain
+         self._email = email
+         self._auth = auth
+
+     def wrap_error(self, e: Exception) -> Exception:
+         if not isinstance(e, httpx.HTTPStatusError):
+             logger.error(f"unhandled exception from Zendesk client: {e}", exc_info=True)
+             return e
+         url = e.request.url
+         response_code = e.response.status_code
+         if response_code == 401:
+             logger.error(
+                 f"Failed to connect via auth,"
+                 f"{url} using zendesk response, status code {response_code}"
+             )
+             return UserAuthError(e)
+         if response_code == 429:
+             logger.error(
+                 f"Failed to connect via rate limits"
+                 f"{url} using zendesk response, status code {response_code}"
+             )
+             return RateLimitError(e)
+         if 400 <= response_code < 500:
+             logger.error(
+                 f"Failed to connect to {url} using zendesk response, status code {response_code}"
+             )
+             return UserError(e)
+         if response_code > 500:
+             logger.error(
+                 f"Failed to connect to {url} using zendesk response, status code {response_code}"
+             )
+             return ProviderError(e)
+         logger.error(f"unhandled http status error from Zendesk client: {e}", exc_info=True)
+         return e
+
+     async def get_articles_async(self) -> List[ZendeskArticle]:
+         """
+         Retrieves article content from Zendesk asynchronously.
+         """
+
+         articles: List[ZendeskArticle] = []
+
+         article_url = f"https://{self._subdomain}.zendesk.com/api/v2/help_center/articles.json"
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(article_url, auth=self._auth)
+                 response.raise_for_status()
+         except Exception as e:
+             raise self.wrap_error(e=e)
+
+         articles_in_response: List[dict] = response.json()["articles"]
+
+         articles = [
+             ZendeskArticle(
+                 id=int(entry["id"]),
+                 author_id=str(entry["author_id"]),
+                 title=str(entry["title"]),
+                 content=entry["body"],
+             )
+             for entry in articles_in_response
+         ]
+         return articles
+
+     async def get_comments_async(self, ticket_id: int) -> List["Comment"]:
+         comments_url = f"https://{self._subdomain}.zendesk.com/api/v2/tickets/{ticket_id}/comments"
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(comments_url, auth=self._auth)
+                 response.raise_for_status()
+         except Exception as e:
+             raise self.wrap_error(e=e)
+
+         return [
+             Comment(
+                 id=int(entry["id"]),
+                 author_id=entry["author_id"],
+                 body=entry["body"],
+                 metadata=entry,
+                 parent_ticket_id=ticket_id,
+             )
+             for entry in response.json()["comments"]
+         ]
+
+     def get_users(self) -> List[dict]:
+
+         users: List[dict] = []
+
+         users_url = f"https://{self._subdomain}.zendesk.com/api/v2/users"
+         try:
+             response = httpx.get(users_url, auth=self._auth)
+             response.raise_for_status()
+         except Exception as e:
+             raise self.wrap_error(e=e)
+
+         users_in_response: List[dict] = response.json()["users"]
+         users = users_in_response
+
+         return users
+
+     async def get_tickets_async(self) -> List["ZendeskTicket"]:
+         tickets: List["ZendeskTicket"] = []
+         tickets_url = f"https://{self._subdomain}.zendesk.com/api/v2/tickets"
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(tickets_url, auth=self._auth)
+                 response.raise_for_status()
+         except Exception as e:
+             raise self.wrap_error(e=e)
+
+         tickets_in_response: List[dict] = response.json()["tickets"]
+
+         for entry in tickets_in_response:
+             ticket = ZendeskTicket(
+                 id=int(entry["id"]),
+                 subject=entry["subject"],
+                 description=entry["description"],
+                 generated_ts=entry["generated_timestamp"],
+                 metadata=entry,
+             )
+             tickets.append(ticket)
+
+         return tickets
+
+     async def get_article_attachments_async(self, article_id: str):
+         """
+         Handles article attachments such as images and stores them as UTF-8 encoded bytes.
+         """
+         article_attachment_url = (
+             f"https://{self._subdomain}.zendesk.com/api/v2/help_center/"
+             f"articles/{article_id}/attachments"
+         )
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(article_attachment_url, auth=self._auth)
+                 response.raise_for_status()
+         except Exception as e:
+             raise self.wrap_error(e=e)
+
+         attachments_in_response: List[Dict] = response.json().get("article_attachments", [])
+         attachments = []
+
+         for attachment in attachments_in_response:
+             attachment_data = {
+                 "id": attachment["id"],
+                 "file_name": attachment["file_name"],
+                 "content_type": attachment["content_type"],
+                 "size": attachment["size"],
+                 "url": attachment["url"],
+                 "content_url": attachment["content_url"],
+             }
+
+             try:
+                 async with httpx.AsyncClient() as client:
+                     download_response = await client.get(attachment["content_url"], auth=self._auth)
+                     download_response.raise_for_status()
+             except Exception as e:
+                 raise self.wrap_error(e=e)
+
+             encoded_content = base64.b64encode(download_response.content).decode("utf-8")
+             attachment_data["encoded_content"] = (
+                 f"data:{attachment_data['content_type']};base64,{encoded_content}"
+             )
+
+             attachments.append(attachment_data)
+
+         return attachments
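
For context, a minimal sketch of how the new client might be exercised on its own, based only on the constructor and methods shown in the hunk above; the subdomain, email, and token values are placeholders:

import asyncio

from unstructured_ingest.v2.processes.connectors.zendesk.client import ZendeskClient

# Placeholder credentials; the constructor issues a blocking httpx.get
# against /api/v2/groups.json before anything else runs.
client = ZendeskClient(
    token="example-api-token",
    subdomain="example-subdomain",
    email="user@example.com",
)

async def main() -> None:
    # Tickets and articles are each fetched with their own httpx.AsyncClient session.
    tickets = await client.get_tickets_async()
    articles = await client.get_articles_async()
    print(f"{len(tickets)} tickets, {len(articles)} articles")

asyncio.run(main())

Any failure inside the client is routed through wrap_error, so 401s surface as UserAuthError, 429s as RateLimitError, and other 4xx/5xx responses as UserError/ProviderError.
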
unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py
@@ -0,0 +1,419 @@
+ from __future__ import annotations
+
+ import datetime
+ import hashlib
+ from dataclasses import dataclass
+ from pathlib import Path
+ from time import time
+ from typing import Any, AsyncGenerator, List, Literal
+
+ import aiofiles
+ import bs4
+ from pydantic import BaseModel, Field, Secret
+
+ from unstructured_ingest.utils.data_prep import batch_generator
+ from unstructured_ingest.utils.html import HtmlMixin
+ from unstructured_ingest.v2.errors import UserAuthError
+ from unstructured_ingest.v2.interfaces import (
+     AccessConfig,
+     BatchFileData,
+     BatchItem,
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     DownloadResponse,
+     FileData,
+     FileDataSourceMetadata,
+     Indexer,
+     IndexerConfig,
+     SourceIdentifiers,
+ )
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+ from .client import ZendeskArticle, ZendeskClient, ZendeskTicket
+
+ CONNECTOR_TYPE = "zendesk"
+
+
+ class ZendeskAdditionalMetadata(BaseModel):
+     item_type: str
+     leading_id: str # is the same as id just being verbose.
+     tail_id: str # last id in the batch.
+
+
+ class ZendeskFileDataSourceMetadata(FileDataSourceMetadata):
+     """
+     inherits metadata object as tickets and articles
+     are treated in single batch, we need to denote indices ticket/article
+     as the source metadata.
+     """
+
+
+ class ZendeskBatchFileData(BatchFileData):
+     additional_metadata: ZendeskAdditionalMetadata
+
+
+ class ZendeskAccessConfig(AccessConfig):
+     api_token: str = Field(
+         description="API token for zendesk generated under Apps and Integrations"
+     )
+
+
+ class ZendeskBatchItemTicket(BatchItem):
+     subject: str
+     description: str
+     item_type: str = "tickets" # placeholder for downloader
+
+
+ class ZendeskBatchItemArticle(BatchItem):
+     title: str
+     author_id: str
+     title: str
+     content: str
+
+
+ class ZendeskConnectionConfig(ConnectionConfig):
+     subdomain: str = Field(description="Subdomain for zendesk site, <sub-domain>.company.com")
+     email: str = Field(description="Email for zendesk site registered at the subdomain")
+     access_config: Secret[ZendeskAccessConfig]
+
+     async def get_client_async(self) -> ZendeskClient:
+         """Provides an async manager for ZendeskClient."""
+         access_config = self.access_config.get_secret_value()
+
+         client = ZendeskClient(
+             email=self.email, subdomain=self.subdomain, token=access_config.api_token
+         )
+         return client
+
+     def get_client(self) -> ZendeskClient:
+
+         access_config = self.access_config.get_secret_value()
+
+         client = ZendeskClient(
+             email=self.email, subdomain=self.subdomain, token=access_config.api_token
+         )
+         return client
+
+
+ class ZendeskIndexerConfig(IndexerConfig):
+     batch_size: int = Field(
+         default=2,
+         description="Number of tickets or articles.",
+     )
+     item_type: Literal["tickets", "articles", "all"] = Field(
+         default="tickets",
+         description="Type of item from zendesk to parse, can only be `tickets` or `articles`.",
+     )
+
+
+ @dataclass
+ class ZendeskIndexer(Indexer):
+     connection_config: ZendeskConnectionConfig
+     index_config: ZendeskIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def precheck(self) -> None:
+         """Validates connection to Zendesk API."""
+         try:
+             client = self.connection_config.get_client()
+             if not client.get_users():
+                 subdomain_endpoint = f"{self.connection_config.subdomain}.zendesk.com"
+                 raise UserAuthError(f"Users do not exist in subdomain {subdomain_endpoint}")
+         except UserAuthError as e:
+             logger.error(f"Source connection error: {e}", exc_info=True)
+             raise
+         except Exception as e:
+             logger.error(f"Failed to validate connection to Zendesk: {e}", exc_info=True)
+             raise UserAuthError(f"Failed to validate connection: {e}")
+
+     def is_async(self) -> bool:
+         return True
+
+     async def _list_articles_async(self) -> List[ZendeskArticle]:
+         client = await self.connection_config.get_client_async()
+         return await client.get_articles_async()
+
+     async def _list_tickets_async(self) -> List[ZendeskTicket]:
+         client = await self.connection_config.get_client_async()
+         return await client.get_tickets_async()
+
+     def _generate_fullpath(self, identifier: str) -> Path:
+         return Path(hashlib.sha256(identifier.encode("utf-8")).hexdigest()[:16] + ".txt")
+
+     async def handle_articles_async(
+         self, articles: List[ZendeskArticle], batch_size: int
+     ) -> AsyncGenerator[ZendeskBatchFileData, None]:
+         """Parses articles from a list and yields FileData objects asynchronously in batches."""
+         for article_batch in batch_generator(articles, batch_size=batch_size):
+
+             article_batch = sorted(article_batch)
+
+             additional_metadata = ZendeskAdditionalMetadata(
+                 item_type="articles",
+                 leading_id=str(article_batch[0].id),
+                 tail_id=str(article_batch[-1].id),
+             )
+
+             metadata = ZendeskFileDataSourceMetadata(
+                 date_processed=str(time()),
+                 record_locator={
+                     "id": str(article_batch[0].id),
+                     "item_type": "articles",
+                 },
+             )
+
+             batch_items: List[ZendeskBatchItemArticle] = [
+                 ZendeskBatchItemArticle(
+                     identifier=str(article.id),
+                     author_id=str(article.author_id),
+                     title=str(article.title),
+                     content=str(article.content),
+                 )
+                 for article in article_batch
+             ]
+
+             full_path = self._generate_fullpath(str(article_batch[0].id))
+             full_path = Path(str(full_path).replace(".txt", ".html"))
+
+             source_identifiers = SourceIdentifiers(filename=full_path.name, fullpath=str(full_path))
+
+             batched_file_data = ZendeskBatchFileData(
+                 identifier=str(article_batch[0].id),
+                 connector_type=self.connector_type,
+                 metadata=metadata,
+                 batch_items=batch_items,
+                 additional_metadata=additional_metadata,
+                 source_identifiers=source_identifiers,
+             )
+
+             yield batched_file_data
+
+     async def handle_tickets_async(
+         self, tickets: List[ZendeskTicket], batch_size: int
+     ) -> AsyncGenerator[ZendeskBatchFileData, None]:
+         """Parses tickets from a list and yields FileData objects asynchronously in batches."""
+         for ticket_batch in batch_generator(tickets, batch_size=batch_size):
+
+             sorted_batch = sorted(ticket_batch)
+
+             additional_metadata = ZendeskAdditionalMetadata(
+                 item_type="tickets",
+                 leading_id=str(sorted_batch[0].id),
+                 tail_id=str(sorted_batch[-1].id),
+             )
+
+             metadata = ZendeskFileDataSourceMetadata(
+                 date_processed=str(time()),
+                 record_locator={
+                     "id": str(sorted_batch[0].id),
+                     "item_type": "tickets",
+                 },
+             )
+
+             batch_items: List[ZendeskBatchItemTicket] = [
+                 ZendeskBatchItemTicket(
+                     identifier=str(ticket.id),
+                     subject=str(ticket.subject),
+                     description=str(ticket.description),
+                 )
+                 for ticket in sorted_batch
+             ]
+
+             full_path = self._generate_fullpath(str(sorted_batch[0].id))
+             source_identifiers = SourceIdentifiers(filename=full_path.name, fullpath=str(full_path))
+
+             batched_file_data = ZendeskBatchFileData(
+                 connector_type=self.connector_type,
+                 metadata=metadata,
+                 batch_items=batch_items,
+                 additional_metadata=additional_metadata,
+                 source_identifiers=source_identifiers,
+             )
+
+             yield batched_file_data
+
+     async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
+         """Determines item type and processes accordingly asynchronously."""
+         item_type = self.index_config.item_type
+         batch_size = self.index_config.batch_size
+
+         if item_type == "articles":
+             articles = await self._list_articles_async()
+             async for file_data in self.handle_articles_async(
+                 articles, batch_size
+             ): # Using async version
+                 yield file_data
+
+         elif item_type == "tickets":
+             tickets = await self._list_tickets_async()
+             async for file_data in self.handle_tickets_async(
+                 tickets, batch_size
+             ): # Using async version
+                 yield file_data
+
+
+ class ZendeskDownloaderConfig(DownloaderConfig, HtmlMixin):
+     pass
+
+
+ @dataclass
+ class ZendeskDownloader(Downloader):
+     download_config: ZendeskDownloaderConfig
+     connection_config: ZendeskConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def is_async(self) -> bool:
+         return True
+
+     def download_embedded_files(
+         self, session, html: str, current_file_data: FileData
+     ) -> list[DownloadResponse]:
+         if not self.download_config.extract_files:
+             return []
+         url = current_file_data.metadata.url
+         if url is None:
+             logger.warning(
+                 f"""Missing URL for file: {current_file_data.source_identifiers.filename}.
+                 Skipping file extraction."""
+             )
+             return []
+         filepath = current_file_data.source_identifiers.relative_path
+         download_path = Path(self.download_dir) / filepath
+         download_dir = download_path.with_suffix("")
+         return self.download_config.extract_embedded_files(
+             url=url,
+             download_dir=download_dir,
+             original_filedata=current_file_data,
+             html=html,
+             session=session,
+         )
+
+     async def handle_articles_async(
+         self, client: ZendeskClient, batch_file_data: ZendeskBatchFileData
+     ):
+         """
+         Processes the article information, downloads the attachments for each article,
+         and updates the content accordingly.
+         """
+         # Determine the download path
+         download_path = self.get_download_path(batch_file_data)
+
+         if download_path is None:
+             raise ValueError("Download path could not be determined")
+
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+
+         async with aiofiles.open(download_path, "a", encoding="utf8") as f:
+             for article in batch_file_data.batch_items:
+                 html_data_str = article.content
+                 soup = bs4.BeautifulSoup(html_data_str, "html.parser")
+
+                 if self.download_config.extract_images:
+                     # Get article attachments asynchronously
+                     image_data_decoded: List = await client.get_article_attachments_async(
+                         article_id=article.identifier
+                     )
+                     img_tags = soup.find_all("img")
+
+                     # Ensure we don't exceed the available images
+                     for img_tag, img_data in zip(img_tags, image_data_decoded):
+                         img_tag["src"] = img_data.get("encoded_content", "")
+
+                 await f.write(soup.prettify())
+
+         return super().generate_download_response(
+             file_data=batch_file_data, download_path=download_path
+         )
+
+     async def handle_tickets_async(
+         self, client: ZendeskClient, batch_file_data: ZendeskBatchFileData
+     ) -> DownloadResponse:
+         """
+         Processes a batch of tickets asynchronously, writing their details and comments to a file.
+         """
+         # Determine the download path
+         download_path = self.get_download_path(batch_file_data)
+         if download_path is None:
+             raise ValueError("Download path could not be determined")
+
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Process each ticket in the batch
+         async with aiofiles.open(download_path, "a", encoding="utf8") as f:
+             for batch_item in batch_file_data.batch_items:
+                 ticket_identifier = batch_item.identifier
+                 first_date = None
+                 comments: List[dict] = []
+
+                 # Fetch comments asynchronously
+                 comments_list = await client.get_comments_async(ticket_id=int(ticket_identifier))
+
+                 for comment in comments_list: # Iterate over the resolved list
+                     date_created = (
+                         comment.metadata["created_at"].isoformat()
+                         if isinstance(comment.metadata["created_at"], datetime.datetime)
+                         else str(comment.metadata["created_at"])
+                     )
+
+                     if first_date is None:
+                         first_date = date_created
+
+                     comments.append(
+                         {
+                             "comment_id": comment.id,
+                             "author_id": comment.author_id,
+                             "body": comment.body,
+                             "date_created": date_created,
+                         }
+                     )
+
+                 # Write ticket details to file
+                 content = (
+                     "\nticket\n"
+                     f"{batch_item.identifier}\n"
+                     f"{batch_file_data.metadata.record_locator.get('subject', '')}\n"
+                     f"{batch_file_data.metadata.record_locator.get('description', '')}\n"
+                     f"{first_date}\n"
+                 )
+
+                 # Append comments
+                 for comment in comments:
+                     content += (
+                         "comment\n"
+                         f"{comment.get('comment_id', '')}\n"
+                         f"{comment.get('author_id', '')}\n"
+                         f"{comment.get('body', '')}\n"
+                         f"{comment.get('date_created', '')}\n"
+                     )
+
+                 await f.write(content)
+
+         return super().generate_download_response(
+             file_data=batch_file_data, download_path=download_path
+         )
+
+     async def run_async(self, file_data: ZendeskBatchFileData, **kwargs: Any) -> DownloadResponse:
+
+         zendesk_filedata: FileData = FileData.cast(file_data=file_data)
+
+         client = await self.connection_config.get_client_async()
+         item_type = zendesk_filedata.metadata.record_locator["item_type"]
+
+         if item_type == "articles":
+             return await self.handle_articles_async(client, file_data)
+         elif item_type == "tickets":
+             return await self.handle_tickets_async(client, file_data)
+         else:
+             raise RuntimeError(f"Item type {item_type} cannot be handled by the downloader")
+
+
+ # create entry
+ zendesk_source_entry = SourceRegistryEntry(
+     connection_config=ZendeskConnectionConfig,
+     indexer_config=ZendeskIndexerConfig,
+     indexer=ZendeskIndexer,
+     downloader=ZendeskDownloader,
+     downloader_config=ZendeskDownloaderConfig,
+ )
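
As a rough illustration of how the pieces registered above fit together, here is a hedged sketch that wires the connection and indexer configs by hand and drives the indexer directly. Only the classes defined in this hunk are assumed; credentials and values are placeholders, and passing the ZendeskAccessConfig instance straight into the Secret-wrapped field is assumed to be accepted by pydantic validation.

import asyncio

from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
    ZendeskAccessConfig,
    ZendeskConnectionConfig,
    ZendeskIndexer,
    ZendeskIndexerConfig,
)

# Placeholder credentials; access_config is declared as Secret[ZendeskAccessConfig].
connection_config = ZendeskConnectionConfig(
    subdomain="example-subdomain",
    email="user@example.com",
    access_config=ZendeskAccessConfig(api_token="example-api-token"),
)

indexer = ZendeskIndexer(
    connection_config=connection_config,
    index_config=ZendeskIndexerConfig(item_type="tickets", batch_size=2),
)

async def main() -> None:
    # Each yielded item is a ZendeskBatchFileData covering up to batch_size tickets.
    async for batch in indexer.run_async():
        print(batch.additional_metadata.item_type, batch.additional_metadata.leading_id)

asyncio.run(main())

In the real pipeline the ZendeskDownloader then receives each ZendeskBatchFileData, reads record_locator["item_type"], and writes the ticket or article content to the download path before partitioning.
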
unstructured_ingest/v2/processes/partitioner.py
@@ -1,4 +1,3 @@
- import json
  from abc import ABC
  from dataclasses import dataclass
  from pathlib import Path
@@ -206,10 +205,8 @@ class Partitioner(BaseProcess, ABC):
 
      def is_client_error_unsupported_filetype(self, error: UserError) -> bool:
          error_msg = error.args[0]
-         error_dict = json.loads(error_msg)
-         details = error_dict["detail"]
-         return "fast strategy is not available for image files" in details or (
-             "file type" in details.lower() and "is not supported" in details.lower()
+         return "fast strategy is not available for image files" in error_msg or (
+             "file type" in error_msg.lower() and "is not supported" in error_msg.lower()
          )
 
      def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
unstructured_ingest/v2/unstructured_api.py
@@ -53,7 +53,14 @@ def create_partition_request(filename: Path, parameters_dict: dict) -> "Partitio
 
 
  def wrap_error(e: Exception) -> Exception:
+     from unstructured_client.models.errors.httpvalidationerror import HTTPValidationError
      from unstructured_client.models.errors.sdkerror import SDKError
+     from unstructured_client.models.errors.servererror import ServerError
+
+     if isinstance(e, HTTPValidationError):
+         return UserError(e.data.detail)
+     if isinstance(e, ServerError):
+         return ProviderError(e.data.detail)
 
      if not isinstance(e, SDKError):
          logger.error(f"Uncaught Error calling API: {e}")