unstructured-ingest 0.5.10__py3-none-any.whl → 0.5.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

--- a/test/integration/connectors/test_astradb.py
+++ b/test/integration/connectors/test_astradb.py
@@ -31,6 +31,7 @@ from unstructured_ingest.v2.processes.connectors.astradb import (
     AstraDBUploader,
     AstraDBUploaderConfig,
     AstraDBUploadStager,
+    AstraDBUploadStagerConfig,
     DestinationConnectionError,
     SourceConnectionError,
 )
@@ -258,3 +259,23 @@ def test_astra_stager(
         stager=stager,
         tmp_dir=tmp_path,
     )
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
+@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
+def test_astra_stager_flatten_metadata(
+    request: TopRequest,
+    upload_file_str: str,
+    tmp_path: Path,
+):
+    stager_config = AstraDBUploadStagerConfig(flatten_metadata=True)
+    upload_file: Path = request.getfixturevalue(upload_file_str)
+    stager = AstraDBUploadStager(upload_stager_config=stager_config)
+    stager_validation(
+        configs=StagerValidationConfigs(
+            test_id=CONNECTOR_TYPE, expected_count=22, expected_folder="stager_flatten_metadata"
+        ),
+        input_file=upload_file,
+        stager=stager,
+        tmp_dir=tmp_path,
+    )
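The new test drives the stager through the shared stager_validation helper and relies on the expected_folder knob added to StagerValidationConfigs in the validation hunk further below. A minimal sketch of how that knob routes fixture lookup (paths are illustrative, not taken from the package):

    from test.integration.connectors.utils.validation.destination import StagerValidationConfigs

    default_configs = StagerValidationConfigs(test_id="astradb", expected_count=22)
    default_configs.stager_output_dir()  # <test output dir>/stager

    flattened_configs = StagerValidationConfigs(
        test_id="astradb", expected_count=22, expected_folder="stager_flatten_metadata"
    )
    flattened_configs.stager_output_dir()  # <test output dir>/stager_flatten_metadata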
--- /dev/null
+++ b/test/integration/connectors/test_zendesk.py
@@ -0,0 +1,142 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+import pytest
+
+from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.errors import UserAuthError
+from unstructured_ingest.v2.processes.connectors.zendesk import (
+    CONNECTOR_TYPE,
+    ZendeskAccessConfig,
+    ZendeskConnectionConfig,
+    ZendeskDownloader,
+    ZendeskDownloaderConfig,
+    ZendeskIndexer,
+    ZendeskIndexerConfig,
+)
+
+
+async def zendesk_source_test(
+    tmp_path: Path,
+    token: Optional[str] = None,
+    email: Optional[str] = None,
+    subdomain: Optional[str] = None,
+):
+
+    access_config = ZendeskAccessConfig(api_token=token)
+    connection_config = ZendeskConnectionConfig(
+        subdomain=subdomain, email=email, access_config=access_config
+    )
+
+    index_config = ZendeskIndexerConfig(batch_size=2, item_type="tickets")
+
+    indexer = ZendeskIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+        connector_type=CONNECTOR_TYPE,
+    )
+
+    # handle downloader.
+    download_config = ZendeskDownloaderConfig(download_dir=tmp_path)
+
+    downloader = ZendeskDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+        connector_type=CONNECTOR_TYPE,
+    )
+
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="zendesk-tickets",
+            expected_num_files=4,
+            validate_file_data=False,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
+async def zendesk_source_articles_test(
+    tmp_path: Path,
+    token: Optional[str] = None,
+    email: Optional[str] = None,
+    subdomain: Optional[str] = None,
+):
+
+    access_config = ZendeskAccessConfig(api_token=token)
+    connection_config = ZendeskConnectionConfig(
+        subdomain=subdomain, email=email, access_config=access_config
+    )
+
+    index_config = ZendeskIndexerConfig(batch_size=2, item_type="articles")
+
+    indexer = ZendeskIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+        connector_type=CONNECTOR_TYPE,
+    )
+
+    # handle downloader.
+    download_config = ZendeskDownloaderConfig(download_dir=tmp_path, extract_images=True)
+
+    downloader = ZendeskDownloader(
+        connection_config=connection_config,
+        download_config=download_config,
+        connector_type=CONNECTOR_TYPE,
+    )
+
+    # Run the source connector validation
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="zendesk-articles",
+            expected_num_files=4,
+            validate_file_data=False,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source(temp_dir):
+    await zendesk_source_test(
+        tmp_path=temp_dir,
+        token=os.environ["ZENDESK_TOKEN"],
+        email="test@unstructured.io",
+        subdomain="unstructuredhelp",
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_articles(temp_dir):
+    await zendesk_source_articles_test(
+        tmp_path=temp_dir,
+        token=os.environ["ZENDESK_TOKEN"],
+        email="test@unstructured.io",
+        subdomain="unstructuredhelp",
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+async def test_zendesk_source_articles_fail(temp_dir):
+    with pytest.raises(expected_exception=UserAuthError):
+        await zendesk_source_articles_test(
+            tmp_path=temp_dir,
+            token="FORCE_FAIL_TOKEN",
+            email="test@unstructured.io",
+            subdomain="unstructuredhelp",
+        )
--- a/test/integration/connectors/utils/validation/destination.py
+++ b/test/integration/connectors/utils/validation/destination.py
@@ -9,9 +9,10 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers, Uploa
 
 class StagerValidationConfigs(ValidationConfig):
     expected_count: int
+    expected_folder: str = "stager"
 
     def stager_output_dir(self) -> Path:
-        dir = self.test_output_dir() / "stager"
+        dir = self.test_output_dir() / self.expected_folder
         dir.mkdir(exist_ok=True, parents=True)
         return dir
 
--- a/unstructured_ingest/__version__.py
+++ b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.10"  # pragma: no cover
+__version__ = "0.5.11"  # pragma: no cover
--- a/unstructured_ingest/v2/interfaces/downloader.py
+++ b/unstructured_ingest/v2/interfaces/downloader.py
@@ -1,5 +1,5 @@
 import os
-from abc import ABC, abstractmethod
+from abc import ABC
 from pathlib import Path
 from typing import Any, Optional, TypedDict, TypeVar, Union
 
@@ -81,9 +81,8 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     def is_async(self) -> bool:
         return True
 
-    @abstractmethod
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
-        pass
+        raise NotImplementedError()
 
     async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
         return self.run(file_data=file_data, **kwargs)
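This hunk relaxes Downloader.run() from an abstract method to a concrete default that raises: async-only downloaders (such as the new Zendesk one below) no longer have to ship a run() stub, and a subclass that implements neither method now fails at call time with NotImplementedError rather than at instantiation. A hypothetical subclass, just to illustrate the new contract:

    # Hypothetical async-only downloader; not part of the package.
    @dataclass
    class AsyncOnlyDownloader(Downloader):
        def is_async(self) -> bool:
            return True

        async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
            # fetch the payload asynchronously and build the download response here
            ...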
--- a/unstructured_ingest/v2/processes/connectors/astradb.py
+++ b/unstructured_ingest/v2/processes/connectors/astradb.py
@@ -144,10 +144,6 @@ async def get_async_astra_collection(
     return async_astra_db_collection
 
 
-class AstraDBUploadStagerConfig(UploadStagerConfig):
-    pass
-
-
 class AstraDBIndexerConfig(IndexerConfig):
     collection_name: str = Field(
         description="The name of the Astra DB collection. "
@@ -158,30 +154,6 @@ class AstraDBIndexerConfig(IndexerConfig):
     batch_size: int = Field(default=20, description="Number of records per batch")
 
 
-class AstraDBDownloaderConfig(DownloaderConfig):
-    fields: list[str] = field(default_factory=list)
-
-
-class AstraDBUploaderConfig(UploaderConfig):
-    collection_name: Optional[str] = Field(
-        description="The name of the Astra DB collection. "
-        "Note that the collection name must only include letters, "
-        "numbers, and underscores.",
-        default=None,
-    )
-    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    requested_indexing_policy: Optional[dict[str, Any]] = Field(
-        default=None,
-        description="The indexing policy to use for the collection.",
-        examples=['{"deny": ["metadata"]}'],
-    )
-    batch_size: int = Field(default=20, description="Number of records per batch")
-    record_id_key: str = Field(
-        default=RECORD_ID_LABEL,
-        description="searchable key to find entries for the same record on previous runs",
-    )
-
-
 @dataclass
 class AstraDBIndexer(Indexer):
     connection_config: AstraDBConnectionConfig
@@ -239,6 +211,10 @@ class AstraDBIndexer(Indexer):
             yield fd
 
 
+class AstraDBDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+
+
 @dataclass
 class AstraDBDownloader(Downloader):
     connection_config: AstraDBConnectionConfig
@@ -315,6 +291,12 @@ class AstraDBDownloader(Downloader):
         return download_responses
 
 
+class AstraDBUploadStagerConfig(UploadStagerConfig):
+    flatten_metadata: Optional[bool] = Field(
+        default=False, description="Move metadata to top level of the record."
+    )
+
+
 @dataclass
 class AstraDBUploadStager(UploadStager):
     upload_stager_config: AstraDBUploadStagerConfig = field(
@@ -336,6 +318,12 @@ class AstraDBUploadStager(UploadStager):
 
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         self.truncate_dict_elements(element_dict)
+        if self.upload_stager_config.flatten_metadata:
+            # move metadata to top level so it isn't nested in metadata column
+            metadata = element_dict.pop("metadata", None)
+            if metadata:
+                element_dict.update(metadata)
+
         return {
             "$vector": element_dict.pop("embeddings", None),
             "content": element_dict.pop("text", None),
@@ -344,6 +332,26 @@ class AstraDBUploadStager(UploadStager):
         }
 
 
+class AstraDBUploaderConfig(UploaderConfig):
+    collection_name: Optional[str] = Field(
+        description="The name of the Astra DB collection. "
+        "Note that the collection name must only include letters, "
+        "numbers, and underscores.",
+        default=None,
+    )
+    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+    requested_indexing_policy: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="The indexing policy to use for the collection.",
+        examples=['{"deny": ["metadata"]}'],
+    )
+    batch_size: int = Field(default=20, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
+
+
 @dataclass
 class AstraDBUploader(Uploader):
     connection_config: AstraDBConnectionConfig
--- /dev/null
+++ b/unstructured_ingest/v2/processes/connectors/zendesk/__init__.py
@@ -0,0 +1,31 @@
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_source_entry,
+)
+
+from .zendesk import (
+    CONNECTOR_TYPE,
+    ZendeskAccessConfig,
+    ZendeskClient,
+    ZendeskConnectionConfig,
+    ZendeskDownloader,
+    ZendeskDownloaderConfig,
+    ZendeskIndexer,
+    ZendeskIndexerConfig,
+    ZendeskTicket,
+    zendesk_source_entry,
+)
+
+__all__ = [
+    "add_source_entry",
+    "zendesk_source_entry",
+    "ZendeskAccessConfig",
+    "ZendeskClient",
+    "ZendeskConnectionConfig",
+    "ZendeskDownloader",
+    "ZendeskDownloaderConfig",
+    "ZendeskIndexer",
+    "ZendeskIndexerConfig",
+    "ZendeskTicket",
+]
+
+add_source_entry(source_type=CONNECTOR_TYPE, entry=zendesk_source_entry)
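Importing the subpackage runs the module-level add_source_entry call, which is how the connector becomes discoverable by type name. A sketch, assuming connector_registry exposes the source_registry mapping that add_source_entry populates:

    # Assumption: source_registry is the mapping that add_source_entry writes to.
    from unstructured_ingest.v2.processes.connectors import zendesk  # noqa: F401 (registers on import)
    from unstructured_ingest.v2.processes.connector_registry import source_registry

    entry = source_registry["zendesk"]
    print(entry.indexer, entry.downloader)  # ZendeskIndexer, ZendeskDownloader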
--- /dev/null
+++ b/unstructured_ingest/v2/processes/connectors/zendesk/client.py
@@ -0,0 +1,225 @@
+import base64
+from dataclasses import dataclass
+from typing import Dict, List
+
+import httpx
+
+from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
+from unstructured_ingest.v2.logger import logger
+
+
+@dataclass
+class Comment:
+    id: int
+    author_id: str
+    body: str
+    parent_ticket_id: str
+    metadata: dict
+
+
+@dataclass
+class ZendeskTicket:
+    id: int
+    subject: str
+    description: str
+    generated_ts: int
+    metadata: dict
+
+    def __lt__(self, other):
+        return int(self.id) < int(other.id)
+
+
+@dataclass
+class ZendeskArticle:
+    id: int
+    author_id: str
+    title: str
+    content: str
+
+    def __lt__(self, other):
+        return int(self.id) < int(other.id)
+
+
+class ZendeskClient:
+
+    def __init__(self, token: str, subdomain: str, email: str):
+        # should be okay to be blocking.
+        url_to_check = f"https://{subdomain}.zendesk.com/api/v2/groups.json"
+        auth = f"{email}/token", token
+
+        try:
+            _ = httpx.get(url_to_check, auth=auth)
+        except Exception as e:
+            raise self.wrap_error(e=e)
+
+        self._token = token
+        self._subdomain = subdomain
+        self._email = email
+        self._auth = auth
+
+    def wrap_error(self, e: Exception) -> Exception:
+        if not isinstance(e, httpx.HTTPStatusError):
+            logger.error(f"unhandled exception from Zendesk client: {e}", exc_info=True)
+            return e
+        url = e.request.url
+        response_code = e.response.status_code
+        if response_code == 401:
+            logger.error(
+                f"Failed to connect via auth,"
+                f"{url} using zendesk response, status code {response_code}"
+            )
+            return UserAuthError(e)
+        if response_code == 429:
+            logger.error(
+                f"Failed to connect via rate limits"
+                f"{url} using zendesk response, status code {response_code}"
+            )
+            return RateLimitError(e)
+        if 400 <= response_code < 500:
+            logger.error(
+                f"Failed to connect to {url} using zendesk response, status code {response_code}"
+            )
+            return UserError(e)
+        if response_code > 500:
+            logger.error(
+                f"Failed to connect to {url} using zendesk response, status code {response_code}"
+            )
+            return ProviderError(e)
+        logger.error(f"unhandled http status error from Zendesk client: {e}", exc_info=True)
+        return e
+
+    async def get_articles_async(self) -> List[ZendeskArticle]:
+        """
+        Retrieves article content from Zendesk asynchronously.
+        """
+
+        articles: List[ZendeskArticle] = []
+
+        article_url = f"https://{self._subdomain}.zendesk.com/api/v2/help_center/articles.json"
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(article_url, auth=self._auth)
+                response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e=e)
+
+        articles_in_response: List[dict] = response.json()["articles"]
+
+        articles = [
+            ZendeskArticle(
+                id=int(entry["id"]),
+                author_id=str(entry["author_id"]),
+                title=str(entry["title"]),
+                content=entry["body"],
+            )
+            for entry in articles_in_response
+        ]
+        return articles
+
+    async def get_comments_async(self, ticket_id: int) -> List["Comment"]:
+        comments_url = f"https://{self._subdomain}.zendesk.com/api/v2/tickets/{ticket_id}/comments"
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(comments_url, auth=self._auth)
+                response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e=e)
+
+        return [
+            Comment(
+                id=int(entry["id"]),
+                author_id=entry["author_id"],
+                body=entry["body"],
+                metadata=entry,
+                parent_ticket_id=ticket_id,
+            )
+            for entry in response.json()["comments"]
+        ]
+
+    def get_users(self) -> List[dict]:
+
+        users: List[dict] = []
+
+        users_url = f"https://{self._subdomain}.zendesk.com/api/v2/users"
+        try:
+            response = httpx.get(users_url, auth=self._auth)
+            response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e=e)
+
+        users_in_response: List[dict] = response.json()["users"]
+        users = users_in_response
+
+        return users
+
+    async def get_tickets_async(self) -> List["ZendeskTicket"]:
+        tickets: List["ZendeskTicket"] = []
+        tickets_url = f"https://{self._subdomain}.zendesk.com/api/v2/tickets"
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(tickets_url, auth=self._auth)
+                response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e=e)
+
+        tickets_in_response: List[dict] = response.json()["tickets"]
+
+        for entry in tickets_in_response:
+            ticket = ZendeskTicket(
+                id=int(entry["id"]),
+                subject=entry["subject"],
+                description=entry["description"],
+                generated_ts=entry["generated_timestamp"],
+                metadata=entry,
+            )
+            tickets.append(ticket)
+
+        return tickets
+
+    async def get_article_attachments_async(self, article_id: str):
+        """
+        Handles article attachments such as images and stores them as UTF-8 encoded bytes.
+        """
+        article_attachment_url = (
+            f"https://{self._subdomain}.zendesk.com/api/v2/help_center/"
+            f"articles/{article_id}/attachments"
+        )
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(article_attachment_url, auth=self._auth)
+                response.raise_for_status()
+        except Exception as e:
+            raise self.wrap_error(e=e)
+
+        attachments_in_response: List[Dict] = response.json().get("article_attachments", [])
+        attachments = []
+
+        for attachment in attachments_in_response:
+            attachment_data = {
+                "id": attachment["id"],
+                "file_name": attachment["file_name"],
+                "content_type": attachment["content_type"],
+                "size": attachment["size"],
+                "url": attachment["url"],
+                "content_url": attachment["content_url"],
+            }
+
+            try:
+                async with httpx.AsyncClient() as client:
+                    download_response = await client.get(attachment["content_url"], auth=self._auth)
+                    download_response.raise_for_status()
+            except Exception as e:
+                raise self.wrap_error(e=e)
+
+            encoded_content = base64.b64encode(download_response.content).decode("utf-8")
+            attachment_data["encoded_content"] = (
+                f"data:{attachment_data['content_type']};base64,{encoded_content}"
+            )
+
+            attachments.append(attachment_data)
+
+        return attachments
1
+ from __future__ import annotations
2
+
3
+ import datetime
4
+ import hashlib
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from time import time
8
+ from typing import Any, AsyncGenerator, List, Literal
9
+
10
+ import aiofiles
11
+ import bs4
12
+ from pydantic import BaseModel, Field, Secret
13
+
14
+ from unstructured_ingest.utils.data_prep import batch_generator
15
+ from unstructured_ingest.utils.html import HtmlMixin
16
+ from unstructured_ingest.v2.errors import UserAuthError
17
+ from unstructured_ingest.v2.interfaces import (
18
+ AccessConfig,
19
+ BatchFileData,
20
+ BatchItem,
21
+ ConnectionConfig,
22
+ Downloader,
23
+ DownloaderConfig,
24
+ DownloadResponse,
25
+ FileData,
26
+ FileDataSourceMetadata,
27
+ Indexer,
28
+ IndexerConfig,
29
+ SourceIdentifiers,
30
+ )
31
+ from unstructured_ingest.v2.logger import logger
32
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
33
+
34
+ from .client import ZendeskArticle, ZendeskClient, ZendeskTicket
35
+
36
+ CONNECTOR_TYPE = "zendesk"
37
+
38
+
39
+ class ZendeskAdditionalMetadata(BaseModel):
40
+ item_type: str
41
+ leading_id: str # is the same as id just being verbose.
42
+ tail_id: str # last id in the batch.
43
+
44
+
45
+ class ZendeskFileDataSourceMetadata(FileDataSourceMetadata):
46
+ """
47
+ inherits metadata object as tickets and articles
48
+ are treated in single batch, we need to denote indices ticket/article
49
+ as the source metadata.
50
+ """
51
+
52
+
53
+ class ZendeskBatchFileData(BatchFileData):
54
+ additional_metadata: ZendeskAdditionalMetadata
55
+
56
+
57
+ class ZendeskAccessConfig(AccessConfig):
58
+ api_token: str = Field(
59
+ description="API token for zendesk generated under Apps and Integrations"
60
+ )
61
+
62
+
63
+ class ZendeskBatchItemTicket(BatchItem):
64
+ subject: str
65
+ description: str
66
+ item_type: str = "tickets" # placeholder for downloader
67
+
68
+
69
+ class ZendeskBatchItemArticle(BatchItem):
70
+ title: str
71
+ author_id: str
72
+ title: str
73
+ content: str
74
+
75
+
76
+ class ZendeskConnectionConfig(ConnectionConfig):
77
+ subdomain: str = Field(description="Subdomain for zendesk site, <sub-domain>.company.com")
78
+ email: str = Field(description="Email for zendesk site registered at the subdomain")
79
+ access_config: Secret[ZendeskAccessConfig]
80
+
81
+ async def get_client_async(self) -> ZendeskClient:
82
+ """Provides an async manager for ZendeskClient."""
83
+ access_config = self.access_config.get_secret_value()
84
+
85
+ client = ZendeskClient(
86
+ email=self.email, subdomain=self.subdomain, token=access_config.api_token
87
+ )
88
+ return client
89
+
90
+ def get_client(self) -> ZendeskClient:
91
+
92
+ access_config = self.access_config.get_secret_value()
93
+
94
+ client = ZendeskClient(
95
+ email=self.email, subdomain=self.subdomain, token=access_config.api_token
96
+ )
97
+ return client
98
+
99
+
100
+ class ZendeskIndexerConfig(IndexerConfig):
101
+ batch_size: int = Field(
102
+ default=2,
103
+ description="Number of tickets or articles.",
104
+ )
105
+ item_type: Literal["tickets", "articles", "all"] = Field(
106
+ default="tickets",
107
+ description="Type of item from zendesk to parse, can only be `tickets` or `articles`.",
108
+ )
109
+
110
+
111
+ @dataclass
112
+ class ZendeskIndexer(Indexer):
113
+ connection_config: ZendeskConnectionConfig
114
+ index_config: ZendeskIndexerConfig
115
+ connector_type: str = CONNECTOR_TYPE
116
+
117
+ def precheck(self) -> None:
118
+ """Validates connection to Zendesk API."""
119
+ try:
120
+ client = self.connection_config.get_client()
121
+ if not client.get_users():
122
+ subdomain_endpoint = f"{self.connection_config.subdomain}.zendesk.com"
123
+ raise UserAuthError(f"Users do not exist in subdomain {subdomain_endpoint}")
124
+ except UserAuthError as e:
125
+ logger.error(f"Source connection error: {e}", exc_info=True)
126
+ raise
127
+ except Exception as e:
128
+ logger.error(f"Failed to validate connection to Zendesk: {e}", exc_info=True)
129
+ raise UserAuthError(f"Failed to validate connection: {e}")
130
+
131
+ def is_async(self) -> bool:
132
+ return True
133
+
134
+ async def _list_articles_async(self) -> List[ZendeskArticle]:
135
+ client = await self.connection_config.get_client_async()
136
+ return await client.get_articles_async()
137
+
138
+ async def _list_tickets_async(self) -> List[ZendeskTicket]:
139
+ client = await self.connection_config.get_client_async()
140
+ return await client.get_tickets_async()
141
+
142
+ def _generate_fullpath(self, identifier: str) -> Path:
143
+ return Path(hashlib.sha256(identifier.encode("utf-8")).hexdigest()[:16] + ".txt")
144
+
145
+ async def handle_articles_async(
146
+ self, articles: List[ZendeskArticle], batch_size: int
147
+ ) -> AsyncGenerator[ZendeskBatchFileData, None]:
148
+ """Parses articles from a list and yields FileData objects asynchronously in batches."""
149
+ for article_batch in batch_generator(articles, batch_size=batch_size):
150
+
151
+ article_batch = sorted(article_batch)
152
+
153
+ additional_metadata = ZendeskAdditionalMetadata(
154
+ item_type="articles",
155
+ leading_id=str(article_batch[0].id),
156
+ tail_id=str(article_batch[-1].id),
157
+ )
158
+
159
+ metadata = ZendeskFileDataSourceMetadata(
160
+ date_processed=str(time()),
161
+ record_locator={
162
+ "id": str(article_batch[0].id),
163
+ "item_type": "articles",
164
+ },
165
+ )
166
+
167
+ batch_items: List[ZendeskBatchItemArticle] = [
168
+ ZendeskBatchItemArticle(
169
+ identifier=str(article.id),
170
+ author_id=str(article.author_id),
171
+ title=str(article.title),
172
+ content=str(article.content),
173
+ )
174
+ for article in article_batch
175
+ ]
176
+
177
+ full_path = self._generate_fullpath(str(article_batch[0].id))
178
+ full_path = Path(str(full_path).replace(".txt", ".html"))
179
+
180
+ source_identifiers = SourceIdentifiers(filename=full_path.name, fullpath=str(full_path))
181
+
182
+ batched_file_data = ZendeskBatchFileData(
183
+ identifier=str(article_batch[0].id),
184
+ connector_type=self.connector_type,
185
+ metadata=metadata,
186
+ batch_items=batch_items,
187
+ additional_metadata=additional_metadata,
188
+ source_identifiers=source_identifiers,
189
+ )
190
+
191
+ yield batched_file_data
192
+
193
+ async def handle_tickets_async(
194
+ self, tickets: List[ZendeskTicket], batch_size: int
195
+ ) -> AsyncGenerator[ZendeskBatchFileData, None]:
196
+ """Parses tickets from a list and yields FileData objects asynchronously in batches."""
197
+ for ticket_batch in batch_generator(tickets, batch_size=batch_size):
198
+
199
+ sorted_batch = sorted(ticket_batch)
200
+
201
+ additional_metadata = ZendeskAdditionalMetadata(
202
+ item_type="tickets",
203
+ leading_id=str(sorted_batch[0].id),
204
+ tail_id=str(sorted_batch[-1].id),
205
+ )
206
+
207
+ metadata = ZendeskFileDataSourceMetadata(
208
+ date_processed=str(time()),
209
+ record_locator={
210
+ "id": str(sorted_batch[0].id),
211
+ "item_type": "tickets",
212
+ },
213
+ )
214
+
215
+ batch_items: List[ZendeskBatchItemTicket] = [
216
+ ZendeskBatchItemTicket(
217
+ identifier=str(ticket.id),
218
+ subject=str(ticket.subject),
219
+ description=str(ticket.description),
220
+ )
221
+ for ticket in sorted_batch
222
+ ]
223
+
224
+ full_path = self._generate_fullpath(str(sorted_batch[0].id))
225
+ source_identifiers = SourceIdentifiers(filename=full_path.name, fullpath=str(full_path))
226
+
227
+ batched_file_data = ZendeskBatchFileData(
228
+ connector_type=self.connector_type,
229
+ metadata=metadata,
230
+ batch_items=batch_items,
231
+ additional_metadata=additional_metadata,
232
+ source_identifiers=source_identifiers,
233
+ )
234
+
235
+ yield batched_file_data
236
+
237
+ async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
238
+ """Determines item type and processes accordingly asynchronously."""
239
+ item_type = self.index_config.item_type
240
+ batch_size = self.index_config.batch_size
241
+
242
+ if item_type == "articles":
243
+ articles = await self._list_articles_async()
244
+ async for file_data in self.handle_articles_async(
245
+ articles, batch_size
246
+ ): # Using async version
247
+ yield file_data
248
+
249
+ elif item_type == "tickets":
250
+ tickets = await self._list_tickets_async()
251
+ async for file_data in self.handle_tickets_async(
252
+ tickets, batch_size
253
+ ): # Using async version
254
+ yield file_data
255
+
256
+
257
+ class ZendeskDownloaderConfig(DownloaderConfig, HtmlMixin):
258
+ pass
259
+
260
+
261
+ @dataclass
262
+ class ZendeskDownloader(Downloader):
263
+ download_config: ZendeskDownloaderConfig
264
+ connection_config: ZendeskConnectionConfig
265
+ connector_type: str = CONNECTOR_TYPE
266
+
267
+ def is_async(self) -> bool:
268
+ return True
269
+
270
+ def download_embedded_files(
271
+ self, session, html: str, current_file_data: FileData
272
+ ) -> list[DownloadResponse]:
273
+ if not self.download_config.extract_files:
274
+ return []
275
+ url = current_file_data.metadata.url
276
+ if url is None:
277
+ logger.warning(
278
+ f"""Missing URL for file: {current_file_data.source_identifiers.filename}.
279
+ Skipping file extraction."""
280
+ )
281
+ return []
282
+ filepath = current_file_data.source_identifiers.relative_path
283
+ download_path = Path(self.download_dir) / filepath
284
+ download_dir = download_path.with_suffix("")
285
+ return self.download_config.extract_embedded_files(
286
+ url=url,
287
+ download_dir=download_dir,
288
+ original_filedata=current_file_data,
289
+ html=html,
290
+ session=session,
291
+ )
292
+
293
+ async def handle_articles_async(
294
+ self, client: ZendeskClient, batch_file_data: ZendeskBatchFileData
295
+ ):
296
+ """
297
+ Processes the article information, downloads the attachments for each article,
298
+ and updates the content accordingly.
299
+ """
300
+ # Determine the download path
301
+ download_path = self.get_download_path(batch_file_data)
302
+
303
+ if download_path is None:
304
+ raise ValueError("Download path could not be determined")
305
+
306
+ download_path.parent.mkdir(parents=True, exist_ok=True)
307
+
308
+ async with aiofiles.open(download_path, "a", encoding="utf8") as f:
309
+ for article in batch_file_data.batch_items:
310
+ html_data_str = article.content
311
+ soup = bs4.BeautifulSoup(html_data_str, "html.parser")
312
+
313
+ if self.download_config.extract_images:
314
+ # Get article attachments asynchronously
315
+ image_data_decoded: List = await client.get_article_attachments_async(
316
+ article_id=article.identifier
317
+ )
318
+ img_tags = soup.find_all("img")
319
+
320
+ # Ensure we don't exceed the available images
321
+ for img_tag, img_data in zip(img_tags, image_data_decoded):
322
+ img_tag["src"] = img_data.get("encoded_content", "")
323
+
324
+ await f.write(soup.prettify())
325
+
326
+ return super().generate_download_response(
327
+ file_data=batch_file_data, download_path=download_path
328
+ )
329
+
330
+ async def handle_tickets_async(
331
+ self, client: ZendeskClient, batch_file_data: ZendeskBatchFileData
332
+ ) -> DownloadResponse:
333
+ """
334
+ Processes a batch of tickets asynchronously, writing their details and comments to a file.
335
+ """
336
+ # Determine the download path
337
+ download_path = self.get_download_path(batch_file_data)
338
+ if download_path is None:
339
+ raise ValueError("Download path could not be determined")
340
+
341
+ download_path.parent.mkdir(parents=True, exist_ok=True)
342
+
343
+ # Process each ticket in the batch
344
+ async with aiofiles.open(download_path, "a", encoding="utf8") as f:
345
+ for batch_item in batch_file_data.batch_items:
346
+ ticket_identifier = batch_item.identifier
347
+ first_date = None
348
+ comments: List[dict] = []
349
+
350
+ # Fetch comments asynchronously
351
+ comments_list = await client.get_comments_async(ticket_id=int(ticket_identifier))
352
+
353
+ for comment in comments_list: # Iterate over the resolved list
354
+ date_created = (
355
+ comment.metadata["created_at"].isoformat()
356
+ if isinstance(comment.metadata["created_at"], datetime.datetime)
357
+ else str(comment.metadata["created_at"])
358
+ )
359
+
360
+ if first_date is None:
361
+ first_date = date_created
362
+
363
+ comments.append(
364
+ {
365
+ "comment_id": comment.id,
366
+ "author_id": comment.author_id,
367
+ "body": comment.body,
368
+ "date_created": date_created,
369
+ }
370
+ )
371
+
372
+ # Write ticket details to file
373
+ content = (
374
+ "\nticket\n"
375
+ f"{batch_item.identifier}\n"
376
+ f"{batch_file_data.metadata.record_locator.get('subject', '')}\n"
377
+ f"{batch_file_data.metadata.record_locator.get('description', '')}\n"
378
+ f"{first_date}\n"
379
+ )
380
+
381
+ # Append comments
382
+ for comment in comments:
383
+ content += (
384
+ "comment\n"
385
+ f"{comment.get('comment_id', '')}\n"
386
+ f"{comment.get('author_id', '')}\n"
387
+ f"{comment.get('body', '')}\n"
388
+ f"{comment.get('date_created', '')}\n"
389
+ )
390
+
391
+ await f.write(content)
392
+
393
+ return super().generate_download_response(
394
+ file_data=batch_file_data, download_path=download_path
395
+ )
396
+
397
+ async def run_async(self, file_data: ZendeskBatchFileData, **kwargs: Any) -> DownloadResponse:
398
+
399
+ zendesk_filedata: FileData = FileData.cast(file_data=file_data)
400
+
401
+ client = await self.connection_config.get_client_async()
402
+ item_type = zendesk_filedata.metadata.record_locator["item_type"]
403
+
404
+ if item_type == "articles":
405
+ return await self.handle_articles_async(client, file_data)
406
+ elif item_type == "tickets":
407
+ return await self.handle_tickets_async(client, file_data)
408
+ else:
409
+ raise RuntimeError(f"Item type {item_type} cannot be handled by the downloader")
410
+
411
+
412
+ # create entry
413
+ zendesk_source_entry = SourceRegistryEntry(
414
+ connection_config=ZendeskConnectionConfig,
415
+ indexer_config=ZendeskIndexerConfig,
416
+ indexer=ZendeskIndexer,
417
+ downloader=ZendeskDownloader,
418
+ downloader_config=ZendeskDownloaderConfig,
419
+ )
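Two details worth noting when reading the connector. First, batch files are named from a SHA-256 of the leading id in the sorted batch, so output paths are stable across runs; a quick sketch:

    import hashlib
    from pathlib import Path

    leading_id = "101"  # first ticket/article id in the sorted batch (illustrative)
    name = hashlib.sha256(leading_id.encode("utf-8")).hexdigest()[:16] + ".txt"
    # tickets keep .txt; article batches swap the suffix to .html before
    # building SourceIdentifiers
    print(Path(name))

Second, run_async routes on metadata.record_locator["item_type"] to pick handle_tickets_async or handle_articles_async; since the indexer stores only id and item_type in the record locator, the record_locator.get('subject', '') and record_locator.get('description', '') lookups in the ticket writer appear to always come back empty.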
--- a/unstructured_ingest-0.5.10.dist-info/METADATA
+++ b/unstructured_ingest-0.5.11.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.10
+Version: 0.5.11
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: tqdm
-Requires-Dist: dataclasses_json
-Requires-Dist: pydantic>=2.7
-Requires-Dist: pandas
 Requires-Dist: click
+Requires-Dist: pandas
+Requires-Dist: pydantic>=2.7
 Requires-Dist: opentelemetry-sdk
 Requires-Dist: python-dateutil
+Requires-Dist: dataclasses_json
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
 Provides-Extra: csv
@@ -66,13 +66,13 @@ Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
 Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
@@ -91,19 +91,19 @@ Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: dropbox
-Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: fsspec; extra == "dropbox"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: gcs
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -124,18 +124,18 @@ Requires-Dist: pymilvus; extra == "milvus"
 Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: neo4j
-Requires-Dist: cymple; extra == "neo4j"
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
+Requires-Dist: cymple; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: onedrive
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: outlook
@@ -178,18 +178,18 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
-Requires-Dist: httpx; extra == "vectara"
 Provides-Extra: vastdb
+Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
-Requires-Dist: vastdb; extra == "vastdb"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -197,11 +197,11 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: boto3; extra == "bedrock"
 Requires-Dist: aioboto3; extra == "bedrock"
+Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Dynamic: author
--- a/unstructured_ingest-0.5.10.dist-info/RECORD
+++ b/unstructured_ingest-0.5.11.dist-info/RECORD
@@ -5,7 +5,7 @@ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 test/integration/chunkers/test_chunkers.py,sha256=USkltQN_mVVCxI0FkJsrS1gnLXlVr-fvsc0tPaK2sWI,1062
 test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/conftest.py,sha256=vYs4WDlCuieAwwErkJxCk4a1lGvr3qpeiAm-YaDznSo,1018
-test/integration/connectors/test_astradb.py,sha256=pZmUItFzS91etJONk5HaX8ayarXmFH7RhKmtBxmCClQ,8995
+test/integration/connectors/test_astradb.py,sha256=c9Lk0dvJVVdzHcokvsc4XMNJ4SIO1k2vGtT5py0cFVM,9753
 test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNpJ8ewGPqHSGrx626j8hC_Pw,9695
 test/integration/connectors/test_chroma.py,sha256=NuQv0PWPM0_LQfdPeUd6IYKqaKKXWmVaHGWjq5aBfOY,3721
 test/integration/connectors/test_confluence.py,sha256=Ju0gRQbD2g9l9iRf2HDZKi7RyPnBGtFRWcGpsqhO3F8,3588
@@ -25,6 +25,7 @@ test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWT
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
 test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
 test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
+test/integration/connectors/test_zendesk.py,sha256=6Xsxxav9b1NBp_zd66S_sE4Nn5iO6Et4a5zgGR2-Y04,4159
 test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
 test/integration/connectors/discord/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -49,7 +50,7 @@ test/integration/connectors/utils/constants.py,sha256=JhTk6YNw7JVpkk-Pl8zn2YYkEx
 test/integration/connectors/utils/docker.py,sha256=4g1STiSbYN5qcmDTXyPxVJgwx97O6wk7n-DJ-zgzgag,4971
 test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQQx_cBfz13ZTs1HpbaYipNU,2223
 test/integration/connectors/utils/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/integration/connectors/utils/validation/destination.py,sha256=ZvMSvqz9in35xaoUJGx9rG8oWCU3FYlfLLQ6sfdI0pw,2649
+test/integration/connectors/utils/validation/destination.py,sha256=HUdwpvszGOuGnKZFawGdxRNptbbJDIghyi-roQjhEs4,2697
 test/integration/connectors/utils/validation/equality.py,sha256=R6d_1c-Si5518WJcBcshF_wBRnywnZ0ORQ-NL0xNmGo,2602
 test/integration/connectors/utils/validation/source.py,sha256=xnAZI26ILdeMhgrWAGrU2N2fqK58YNGkfyUhJekZ0Ho,13541
 test/integration/connectors/utils/validation/utils.py,sha256=xYYvAbqP6_lZyH09_JjB4w2Sf8aQPvDVT5vZTs05ILs,1428
@@ -110,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=zt01ptYEjMmXlL3j2UXGxQc1ECQ60nEk9hYEq2kojkc,43
+unstructured_ingest/__version__.py,sha256=jn_Macoo3VuCWr-9TnO28WPJsWO8fYuvd5mexbdfL3c,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -397,7 +398,7 @@ unstructured_ingest/v2/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFB
 unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1KRxlwITNW1xl1YxMAG8BcTk0,7604
 unstructured_ingest/v2/interfaces/__init__.py,sha256=Xp7-345QpM6MG7V7G4ZrVERjADAUBiPAY88PKaMRyqY,1005
 unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
-unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
+unstructured_ingest/v2/interfaces/downloader.py,sha256=Qi_wISgUACZKEPu5p1kUaG3uiCXcr3zWg9z9uRDwoOk,2927
 unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJigGKwoOYc10g9A6PSo,3834
 unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
 unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
@@ -427,7 +428,7 @@ unstructured_ingest/v2/processes/partitioner.py,sha256=HxopDSbovLh_1epeGeVtuWEX7
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
 unstructured_ingest/v2/processes/connectors/__init__.py,sha256=rkEQVVgcHoY3jwgW_5PH_NzdXIEwtBLs9Dk4VzmTZMA,6387
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
-unstructured_ingest/v2/processes/connectors/astradb.py,sha256=3WFJUNEjeuZFhsLW9KzOIOsiStCjpnqKokS1oIQLUR0,17816
+unstructured_ingest/v2/processes/connectors/astradb.py,sha256=E6fB4anCd_gtSzVUsZ5pDrfdxs5AWERQM_NEfeenfEs,18202
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
 unstructured_ingest/v2/processes/connectors/confluence.py,sha256=wTZewdbmCHaQuEJZ7Wf0NBOo8fS_n1I0DDwlhN96woE,11243
@@ -571,9 +572,12 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
 unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=UZ_s8dnVNx9BWFG2fPah4VbQbgEDF4nP78bQeU3jg08,12821
+unstructured_ingest/v2/processes/connectors/zendesk/__init__.py,sha256=XMNocKJ3FHDfy36p_KHhH7ALi0-ji6NhGuQNCV2E4vY,699
+unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=wK2x5t2h0qXSwCYgli8Zegg8bujdSrgnmiTO-bu7nN4,7297
+unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=97yikyb6EQ70pjU2ZXpYnJeC55vkeXaEXlawx5qS6Oo,15228
-unstructured_ingest-0.5.10.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.5.10.dist-info/METADATA,sha256=uDnGDugbuWFqPuo9b-ZVLsuJ57ct-mfYQuAAvYIvE4c,8317
-unstructured_ingest-0.5.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-unstructured_ingest-0.5.10.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.5.10.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.5.10.dist-info/RECORD,,
+unstructured_ingest-0.5.11.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.11.dist-info/METADATA,sha256=5HEW821YxrURJHOb7OxOa8AggarvDctXU0V8p2z1gws,8317
+unstructured_ingest-0.5.11.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.11.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.11.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.11.dist-info/RECORD,,