unstructured-ingest 0.5.14__py3-none-any.whl → 0.5.16__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (39)
  1. test/integration/connectors/test_confluence.py +2 -2
  2. test/integration/connectors/test_zendesk.py +31 -53
  3. test/integration/connectors/utils/validation/source.py +5 -3
  4. test/unit/v2/connectors/test_confluence.py +35 -3
  5. unstructured_ingest/__version__.py +1 -1
  6. unstructured_ingest/embed/huggingface.py +3 -7
  7. unstructured_ingest/utils/data_prep.py +4 -2
  8. unstructured_ingest/v2/interfaces/file_data.py +1 -1
  9. unstructured_ingest/v2/interfaces/upload_stager.py +3 -6
  10. unstructured_ingest/v2/pipeline/pipeline.py +7 -0
  11. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  12. unstructured_ingest/v2/pipeline/steps/download.py +3 -3
  13. unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  14. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  15. unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  16. unstructured_ingest/v2/processes/connectors/confluence.py +20 -3
  17. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +6 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +6 -0
  19. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +6 -0
  20. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +6 -0
  21. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +1 -1
  22. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -0
  23. unstructured_ingest/v2/processes/connectors/fsspec/box.py +6 -0
  24. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +6 -0
  25. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +6 -0
  26. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +6 -0
  27. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +6 -0
  28. unstructured_ingest/v2/processes/connectors/local.py +8 -1
  29. unstructured_ingest/v2/processes/connectors/zendesk/client.py +221 -156
  30. unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +83 -274
  31. unstructured_ingest/v2/processes/embedder.py +3 -4
  32. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  33. unstructured_ingest/v2/processes/utils/blob_storage.py +31 -0
  34. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/METADATA +20 -20
  35. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/RECORD +39 -37
  36. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/LICENSE.md +0 -0
  37. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/WHEEL +0 -0
  38. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/entry_points.txt +0 -0
  39. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py

@@ -1,22 +1,17 @@
 from __future__ import annotations

-import datetime
 import hashlib
 from dataclasses import dataclass
 from pathlib import Path
 from time import time
-from typing import Any, AsyncGenerator, List, Literal
+from typing import Any, AsyncGenerator, Literal, Union

 from pydantic import BaseModel, Field, Secret

-from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.html import HtmlMixin
-from unstructured_ingest.v2.errors import UserAuthError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
-    BatchFileData,
-    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -36,20 +31,11 @@ CONNECTOR_TYPE = "zendesk"


 class ZendeskAdditionalMetadata(BaseModel):
-    item_type: str
-    leading_id: str  # is the same as id just being verbose.
-    tail_id: str  # last id in the batch.
+    item_type: Literal["ticket", "article"]
+    content: Union[ZendeskTicket, ZendeskArticle]


-class ZendeskFileDataSourceMetadata(FileDataSourceMetadata):
-    """
-    inherits metadata object as tickets and articles
-    are treated in single batch, we need to denote indices ticket/article
-    as the source metadata.
-    """
-
-
-class ZendeskBatchFileData(BatchFileData):
+class ZendeskFileData(FileData):
     additional_metadata: ZendeskAdditionalMetadata


@@ -59,48 +45,21 @@ class ZendeskAccessConfig(AccessConfig):
     )


-class ZendeskBatchItemTicket(BatchItem):
-    subject: str
-    description: str
-    item_type: str = "tickets"  # placeholder for downloader
-
-
-class ZendeskBatchItemArticle(BatchItem):
-    title: str
-    author_id: str
-    title: str
-    content: str
-
-
 class ZendeskConnectionConfig(ConnectionConfig):
     subdomain: str = Field(description="Subdomain for zendesk site, <sub-domain>.company.com")
     email: str = Field(description="Email for zendesk site registered at the subdomain")
     access_config: Secret[ZendeskAccessConfig]

-    async def get_client_async(self) -> ZendeskClient:
-        """Provides an async manager for ZendeskClient."""
-        access_config = self.access_config.get_secret_value()
-
-        client = ZendeskClient(
-            email=self.email, subdomain=self.subdomain, token=access_config.api_token
-        )
-        return client
-
     def get_client(self) -> ZendeskClient:

         access_config = self.access_config.get_secret_value()

-        client = ZendeskClient(
+        return ZendeskClient(
             email=self.email, subdomain=self.subdomain, token=access_config.api_token
         )
-        return client


 class ZendeskIndexerConfig(IndexerConfig):
-    batch_size: int = Field(
-        default=2,
-        description="Number of tickets or articles.",
-    )
     item_type: Literal["tickets", "articles", "all"] = Field(
         default="tickets",
         description="Type of item from zendesk to parse, can only be `tickets` or `articles`.",
@@ -115,142 +74,76 @@ class ZendeskIndexer(Indexer):

     def precheck(self) -> None:
         """Validates connection to Zendesk API."""
-        try:
-            client = self.connection_config.get_client()
-            if not client.get_users():
-                subdomain_endpoint = f"{self.connection_config.subdomain}.zendesk.com"
-                raise UserAuthError(f"Users do not exist in subdomain {subdomain_endpoint}")
-        except UserAuthError as e:
-            logger.error(f"Source connection error: {e}", exc_info=True)
-            raise
-        except Exception as e:
-            logger.error(f"Failed to validate connection to Zendesk: {e}", exc_info=True)
-            raise UserAuthError(f"Failed to validate connection: {e}")
+        self.connection_config.get_client()

     def is_async(self) -> bool:
         return True

-    async def _list_articles_async(self) -> List[ZendeskArticle]:
-        client = await self.connection_config.get_client_async()
-        return await client.get_articles_async()
-
-    async def _list_tickets_async(self) -> List[ZendeskTicket]:
-        client = await self.connection_config.get_client_async()
-        return await client.get_tickets_async()
-
     def _generate_fullpath(self, identifier: str) -> Path:
         return Path(hashlib.sha256(identifier.encode("utf-8")).hexdigest()[:16] + ".txt")

-    async def handle_articles_async(
-        self, articles: List[ZendeskArticle], batch_size: int
-    ) -> AsyncGenerator[ZendeskBatchFileData, None]:
-        """Parses articles from a list and yields FileData objects asynchronously in batches."""
-        for article_batch in batch_generator(articles, batch_size=batch_size):
-
-            article_batch = sorted(article_batch)
-
-            additional_metadata = ZendeskAdditionalMetadata(
-                item_type="articles",
-                leading_id=str(article_batch[0].id),
-                tail_id=str(article_batch[-1].id),
-            )
-
-            metadata = ZendeskFileDataSourceMetadata(
-                date_processed=str(time()),
-                record_locator={
-                    "id": str(article_batch[0].id),
-                    "item_type": "articles",
-                },
-            )
-
-            batch_items: List[ZendeskBatchItemArticle] = [
-                ZendeskBatchItemArticle(
-                    identifier=str(article.id),
-                    author_id=str(article.author_id),
-                    title=str(article.title),
-                    content=str(article.content),
-                )
-                for article in article_batch
-            ]
-
-            full_path = self._generate_fullpath(str(article_batch[0].id))
-            full_path = Path(str(full_path).replace(".txt", ".html"))
-
-            source_identifiers = SourceIdentifiers(filename=full_path.name, fullpath=str(full_path))
-
-            batched_file_data = ZendeskBatchFileData(
-                identifier=str(article_batch[0].id),
-                connector_type=self.connector_type,
-                metadata=metadata,
-                batch_items=batch_items,
-                additional_metadata=additional_metadata,
-                source_identifiers=source_identifiers,
-            )
-
-            yield batched_file_data
-
-    async def handle_tickets_async(
-        self, tickets: List[ZendeskTicket], batch_size: int
-    ) -> AsyncGenerator[ZendeskBatchFileData, None]:
-        """Parses tickets from a list and yields FileData objects asynchronously in batches."""
-        for ticket_batch in batch_generator(tickets, batch_size=batch_size):
-
-            sorted_batch = sorted(ticket_batch)
-
-            additional_metadata = ZendeskAdditionalMetadata(
-                item_type="tickets",
-                leading_id=str(sorted_batch[0].id),
-                tail_id=str(sorted_batch[-1].id),
-            )
-
-            metadata = ZendeskFileDataSourceMetadata(
-                date_processed=str(time()),
-                record_locator={
-                    "id": str(sorted_batch[0].id),
-                    "item_type": "tickets",
-                },
-            )
-
-            batch_items: List[ZendeskBatchItemTicket] = [
-                ZendeskBatchItemTicket(
+    async def get_tickets(self) -> AsyncGenerator[ZendeskFileData, None]:
+        async with self.connection_config.get_client() as client:
+            async for ticket in client.get_tickets():
+                yield ZendeskFileData(
                     identifier=str(ticket.id),
-                    subject=str(ticket.subject),
-                    description=str(ticket.description),
+                    connector_type=self.connector_type,
+                    source_identifiers=SourceIdentifiers(
+                        filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
+                    ),
+                    additional_metadata=ZendeskAdditionalMetadata(
+                        item_type="ticket", content=ticket
+                    ),
+                    metadata=FileDataSourceMetadata(
+                        url=str(ticket.url) if ticket.url else None,
+                        date_created=ticket.created_at.isoformat() if ticket.created_at else None,
+                        date_modified=ticket.updated_at.isoformat() if ticket.updated_at else None,
+                        date_processed=str(time()),
+                    ),
                 )
-                for ticket in sorted_batch
-            ]
-
-            full_path = self._generate_fullpath(str(sorted_batch[0].id))
-            source_identifiers = SourceIdentifiers(filename=full_path.name, fullpath=str(full_path))
-
-            batched_file_data = ZendeskBatchFileData(
-                connector_type=self.connector_type,
-                metadata=metadata,
-                batch_items=batch_items,
-                additional_metadata=additional_metadata,
-                source_identifiers=source_identifiers,
-            )

-            yield batched_file_data
+    async def get_articles(self) -> AsyncGenerator[ZendeskFileData, None]:
+        async with self.connection_config.get_client() as client:
+            async for article in client.get_articles():
+                yield ZendeskFileData(
+                    identifier=str(article.id),
+                    connector_type=self.connector_type,
+                    source_identifiers=SourceIdentifiers(
+                        filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
+                    ),
+                    additional_metadata=ZendeskAdditionalMetadata(
+                        item_type="article", content=article
+                    ),
+                    metadata=FileDataSourceMetadata(
+                        url=str(article.url) if article.url else None,
+                        date_created=article.created_at.isoformat() if article.created_at else None,
+                        date_modified=(
+                            article.updated_at.isoformat() if article.updated_at else None
+                        ),
+                        date_processed=str(time()),
+                    ),
+                )

-    async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
+    async def run_async(self, **kwargs: Any) -> AsyncGenerator[ZendeskFileData, None]:
         """Determines item type and processes accordingly asynchronously."""
         item_type = self.index_config.item_type
-        batch_size = self.index_config.batch_size

         if item_type == "articles":
-            articles = await self._list_articles_async()
-            async for file_data in self.handle_articles_async(
-                articles, batch_size
-            ):  # Using async version
-                yield file_data
+            async for article_file_data in self.get_articles():
+                yield article_file_data

         elif item_type == "tickets":
-            tickets = await self._list_tickets_async()
-            async for file_data in self.handle_tickets_async(
-                tickets, batch_size
-            ):  # Using async version
-                yield file_data
+            async for ticket_file_data in self.get_tickets():
+                yield ticket_file_data
+
+        elif item_type == "all":
+            async for article_file_data in self.get_articles():
+                yield article_file_data
+            async for ticket_file_data in self.get_tickets():
+                yield ticket_file_data
+
+        else:
+            raise ValueError(f"Item type {item_type} is not supported by the indexer")


 class ZendeskDownloaderConfig(DownloaderConfig, HtmlMixin):
@@ -289,130 +182,46 @@ class ZendeskDownloader(Downloader):
             session=session,
         )

-    @requires_dependencies(["bs4", "aiofiles"], extras="zendesk")
-    async def handle_articles_async(
-        self, client: ZendeskClient, batch_file_data: ZendeskBatchFileData
-    ):
-        """
-        Processes the article information, downloads the attachments for each article,
-        and updates the content accordingly.
-        """
+    @requires_dependencies(["aiofiles", "bs4"], extras="zendesk")
+    async def download_article(self, article: ZendeskArticle, download_path: Path) -> None:
        import aiofiles
        import bs4

-        # Determine the download path
-        download_path = self.get_download_path(batch_file_data)
-
-        if download_path is None:
-            raise ValueError("Download path could not be determined")
-
-        download_path.parent.mkdir(parents=True, exist_ok=True)
-
-        async with aiofiles.open(download_path, "a", encoding="utf8") as f:
-            for article in batch_file_data.batch_items:
-                html_data_str = article.content
-                soup = bs4.BeautifulSoup(html_data_str, "html.parser")
-
-                if self.download_config.extract_images:
-                    # Get article attachments asynchronously
-                    image_data_decoded: List = await client.get_article_attachments_async(
-                        article_id=article.identifier
-                    )
-                    img_tags = soup.find_all("img")
-
-                    # Ensure we don't exceed the available images
-                    for img_tag, img_data in zip(img_tags, image_data_decoded):
-                        img_tag["src"] = img_data.get("encoded_content", "")
-
+        article_html = article.as_html()
+        soup = bs4.BeautifulSoup(article_html, "html.parser")
+        async with aiofiles.open(download_path, "w", encoding="utf8") as f:
            await f.write(soup.prettify())

-        return super().generate_download_response(
-            file_data=batch_file_data, download_path=download_path
-        )
-
     @requires_dependencies(["aiofiles"], extras="zendesk")
-    async def handle_tickets_async(
-        self, client: ZendeskClient, batch_file_data: ZendeskBatchFileData
-    ) -> DownloadResponse:
-        """
-        Processes a batch of tickets asynchronously, writing their details and comments to a file.
-        """
+    async def download_ticket(self, ticket: ZendeskTicket, download_path: Path) -> None:
        import aiofiles

-        # Determine the download path
-        download_path = self.get_download_path(batch_file_data)
-        if download_path is None:
-            raise ValueError("Download path could not be determined")
-
-        download_path.parent.mkdir(parents=True, exist_ok=True)
-
-        # Process each ticket in the batch
-        async with aiofiles.open(download_path, "a", encoding="utf8") as f:
-            for batch_item in batch_file_data.batch_items:
-                ticket_identifier = batch_item.identifier
-                first_date = None
-                comments: List[dict] = []
-
-                # Fetch comments asynchronously
-                comments_list = await client.get_comments_async(ticket_id=int(ticket_identifier))
-
-                for comment in comments_list:  # Iterate over the resolved list
-                    date_created = (
-                        comment.metadata["created_at"].isoformat()
-                        if isinstance(comment.metadata["created_at"], datetime.datetime)
-                        else str(comment.metadata["created_at"])
-                    )
-
-                    if first_date is None:
-                        first_date = date_created
-
-                    comments.append(
-                        {
-                            "comment_id": comment.id,
-                            "author_id": comment.author_id,
-                            "body": comment.body,
-                            "date_created": date_created,
-                        }
-                    )
-
-                # Write ticket details to file
-                content = (
-                    "\nticket\n"
-                    f"{batch_item.identifier}\n"
-                    f"{batch_file_data.metadata.record_locator.get('subject', '')}\n"
-                    f"{batch_file_data.metadata.record_locator.get('description', '')}\n"
-                    f"{first_date}\n"
-                )
-
-                # Append comments
+        async with aiofiles.open(download_path, "w", encoding="utf8") as f:
+            await f.write(ticket.as_text())
+            async with self.connection_config.get_client() as client:
+                comments = [comment async for comment in client.get_comments(ticket_id=ticket.id)]
                for comment in comments:
-                    content += (
-                        "comment\n"
-                        f"{comment.get('comment_id', '')}\n"
-                        f"{comment.get('author_id', '')}\n"
-                        f"{comment.get('body', '')}\n"
-                        f"{comment.get('date_created', '')}\n"
-                    )
-
-                await f.write(content)
+                    await f.write(comment.as_text())

-        return super().generate_download_response(
-            file_data=batch_file_data, download_path=download_path
-        )
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:

-    async def run_async(self, file_data: ZendeskBatchFileData, **kwargs: Any) -> DownloadResponse:
+        zendesk_filedata = ZendeskFileData.cast(file_data=file_data)

-        zendesk_filedata: FileData = FileData.cast(file_data=file_data)
-
-        client = await self.connection_config.get_client_async()
-        item_type = zendesk_filedata.metadata.record_locator["item_type"]
+        item_type = zendesk_filedata.additional_metadata.item_type
+        download_path = self.get_download_path(file_data=zendesk_filedata)
+        download_path.parent.mkdir(parents=True, exist_ok=True)

-        if item_type == "articles":
-            return await self.handle_articles_async(client, file_data)
-        elif item_type == "tickets":
-            return await self.handle_tickets_async(client, file_data)
+        if item_type == "article":
+            article = ZendeskArticle.model_validate(zendesk_filedata.additional_metadata.content)
+            await self.download_article(article=article, download_path=download_path)
+        elif item_type == "ticket":
+            ticket = ZendeskTicket.model_validate(zendesk_filedata.additional_metadata.content)
+            await self.download_ticket(ticket=ticket, download_path=download_path)
        else:
            raise RuntimeError(f"Item type {item_type} cannot be handled by the downloader")
+        return super().generate_download_response(
+            file_data=zendesk_filedata, download_path=download_path
+        )


 # create entry
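
The net effect of this rewrite is that the Zendesk indexer now emits one FileData per ticket or article, carrying the item itself in additional_metadata.content, instead of batches keyed by leading/tail ids. A minimal sketch of driving the new interface, assuming the class and field names shown in the diff above (the subdomain, email, and token values are placeholders, and the dataclass construction of ZendeskIndexer is an assumption about the surrounding code not shown here):

    import asyncio

    from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
        ZendeskAccessConfig,
        ZendeskConnectionConfig,
        ZendeskIndexer,
        ZendeskIndexerConfig,
    )


    async def main() -> None:
        indexer = ZendeskIndexer(
            connection_config=ZendeskConnectionConfig(
                subdomain="example",  # placeholder subdomain
                email="user@example.com",  # placeholder email
                access_config=ZendeskAccessConfig(api_token="secret"),  # placeholder token
            ),
            index_config=ZendeskIndexerConfig(item_type="all"),
        )
        # Each yielded ZendeskFileData wraps a single ticket or article in
        # additional_metadata.content, rather than a batch of ids.
        async for file_data in indexer.run_async():
            print(file_data.additional_metadata.item_type, file_data.identifier)


    asyncio.run(main())
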
unstructured_ingest/v2/processes/embedder.py

@@ -1,4 +1,3 @@
-import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
@@ -6,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional

 from pydantic import BaseModel, Field, SecretStr

+from unstructured_ingest.utils.data_prep import get_data
 from unstructured_ingest.v2.interfaces.process import BaseProcess

 if TYPE_CHECKING:
@@ -192,9 +192,8 @@ class Embedder(BaseProcess, ABC):
     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         # TODO update base embedder classes to support async
         embedder = self.config.get_embedder()
-        with elements_filepath.open("r") as elements_file:
-            elements = json.load(elements_file)
+        elements = get_data(path=elements_filepath)
         if not elements:
-            return [e.to_dict() for e in elements]
+            return []
         embedded_elements = embedder.embed_documents(elements=elements)
         return embedded_elements
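
This change swaps a hard-coded json.load for the get_data helper from data_prep (which also changed in this release, +4 -2), and short-circuits empty inputs with an empty list instead of the old no-op comprehension. A hedged sketch of the new read path; the assumption that get_data dispatches on file suffix (e.g. .json vs .ndjson) and the file path below are illustrative, not confirmed by this diff:

    from pathlib import Path

    from unstructured_ingest.utils.data_prep import get_data

    # Hypothetical elements file produced by an earlier pipeline step.
    elements = get_data(path=Path("work/partitioned/doc.ndjson"))
    if not elements:
        print("nothing to embed")  # 0.5.16 returns [] here instead of iterating
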
unstructured_ingest/v2/processes/utils/__init__.py

File without changes
unstructured_ingest/v2/processes/utils/blob_storage.py (new file)

@@ -0,0 +1,31 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from unstructured_ingest.utils.data_prep import get_data, write_data
+from unstructured_ingest.v2.interfaces import FileData, UploadStager, UploadStagerConfig
+
+
+class BlobStoreUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class BlobStoreUploadStager(UploadStager):
+    upload_stager_config: BlobStoreUploadStagerConfig = field(
+        default_factory=BlobStoreUploadStagerConfig
+    )
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
+        # Always save as json
+        data = get_data(elements_filepath)
+        write_data(path=output_file.with_suffix(".json"), data=data)
+        return output_file.with_suffix(".json")
{unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.14
+Version: 0.5.16
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -24,11 +24,11 @@ Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: pandas
 Requires-Dist: opentelemetry-sdk
-Requires-Dist: python-dateutil
-Requires-Dist: pydantic>=2.7
-Requires-Dist: click
 Requires-Dist: tqdm
 Requires-Dist: dataclasses_json
+Requires-Dist: pydantic>=2.7
+Requires-Dist: python-dateutil
+Requires-Dist: click
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
 Provides-Extra: csv
@@ -86,21 +86,21 @@ Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: delta-table
-Requires-Dist: boto3; extra == "delta-table"
 Requires-Dist: deltalake; extra == "delta-table"
+Requires-Dist: boto3; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -109,8 +109,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client; extra == "hubspot"
 Requires-Dist: urllib3; extra == "hubspot"
+Requires-Dist: hubspot-api-client; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -125,17 +125,17 @@ Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: neo4j-rust-ext; extra == "neo4j"
-Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: networkx; extra == "neo4j"
 Provides-Extra: notion
 Requires-Dist: httpx; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
 Provides-Extra: onedrive
 Requires-Dist: msal; extra == "onedrive"
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: outlook
@@ -160,8 +160,8 @@ Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: slack
 Requires-Dist: slack_sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
@@ -178,22 +178,22 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: httpx; extra == "vectara"
-Requires-Dist: requests; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
+Requires-Dist: requests; extra == "vectara"
+Requires-Dist: httpx; extra == "vectara"
 Provides-Extra: vastdb
 Requires-Dist: ibis; extra == "vastdb"
-Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
+Requires-Dist: pyarrow; extra == "vastdb"
 Provides-Extra: zendesk
 Requires-Dist: httpx; extra == "zendesk"
-Requires-Dist: aiofiles; extra == "zendesk"
 Requires-Dist: bs4; extra == "zendesk"
+Requires-Dist: aiofiles; extra == "zendesk"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -201,8 +201,8 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: bedrock
 Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"