unstructured-ingest 1.0.28-py3-none-any.whl → 1.0.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes.py +8 -3
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +4 -5
- unstructured_ingest/processes/connectors/google_drive.py +295 -61
- unstructured_ingest/processes/connectors/onedrive.py +5 -5
- unstructured_ingest/processes/connectors/redisdb.py +47 -20
- {unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/METADATA +3 -2
- {unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/RECORD +11 -11
- {unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/licenses/LICENSE.md +0 -0
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "1.0.28" # pragma: no cover
+__version__ = "1.0.32" # pragma: no cover
unstructured_ingest/processes/connectors/databricks/volumes.py
@@ -196,9 +196,14 @@ class DatabricksVolumesUploader(Uploader, ABC):
     connection_config: DatabricksVolumesConnectionConfig

     def get_output_path(self, file_data: FileData) -> str:
-
-
-
+        if file_data.source_identifiers.fullpath:
+            return os.path.join(
+                self.upload_config.path, f"{file_data.source_identifiers.fullpath}.json"
+            )
+        else:
+            return os.path.join(
+                self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
+            )

     def precheck(self) -> None:
         try:
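The replacement get_output_path prefers source_identifiers.fullpath so the Databricks volume mirrors the source directory layout, falling back to the bare filename otherwise. A minimal sketch of that fallback, using a hypothetical stand-in for the connector's source_identifiers object:

import os
from dataclasses import dataclass
from typing import Optional

@dataclass
class SourceIdentifiersStub:
    # Hypothetical stand-in for the connector's source_identifiers object
    filename: str
    fullpath: Optional[str] = None

def get_output_path(base: str, ids: SourceIdentifiersStub) -> str:
    # Prefer the full relative path so the volume mirrors the source layout
    if ids.fullpath:
        return os.path.join(base, f"{ids.fullpath}.json")
    return os.path.join(base, f"{ids.filename}.json")

print(get_output_path("/Volumes/catalog/schema/vol", SourceIdentifiersStub("a.pdf", "docs/a.pdf")))
# /Volumes/catalog/schema/vol/docs/a.pdf.json
print(get_output_path("/Volumes/catalog/schema/vol", SourceIdentifiersStub("a.pdf")))
# /Volumes/catalog/schema/vol/a.pdf.json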
unstructured_ingest/processes/connectors/fsspec/fsspec.py
@@ -343,10 +343,9 @@ class FsspecUploader(Uploader):
         raise self.wrap_error(e=e)

     def get_upload_path(self, file_data: FileData) -> Path:
-        upload_path = (
-
-
-        )
+        upload_path = Path(
+            self.upload_config.path_without_protocol
+        ) / file_data.source_identifiers.fullpath.lstrip("/")
         updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
         return updated_upload_path

@@ -358,8 +357,8 @@ class FsspecUploader(Uploader):
         client.upload(lpath=path_str, rpath=upload_path.as_posix())

     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
+        upload_path = self.get_upload_path(file_data=file_data)
         # Odd that fsspec doesn't run exists() as async even when client support async
         logger.debug(f"writing local file {path_str} to {upload_path}")
         with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
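The rewritten get_upload_path leans on a pathlib subtlety: joining an absolute path discards everything to its left, so without lstrip("/") a fullpath beginning with "/" would escape the configured prefix entirely. A quick demonstration:

from pathlib import Path

bucket_root = Path("my-bucket/output")
fullpath = "/docs/report.pdf"

# Joining an absolute path discards the left-hand side entirely:
print(bucket_root / fullpath)  # /docs/report.pdf  (bucket prefix lost!)

# Stripping the leading slash keeps the file under the configured prefix:
upload_path = bucket_root / fullpath.lstrip("/")
print(upload_path)  # my-bucket/output/docs/report.pdf

# The uploader then appends the .json suffix the same way the diff does:
print(upload_path.parent / f"{upload_path.name}.json")
# my-bucket/output/docs/report.pdf.json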
unstructured_ingest/processes/connectors/google_drive.py
@@ -52,6 +52,10 @@ EXPORT_EXTENSION_MAP = {
     "text/html": ".html",
 }

+# LRO Export Size Threshold is 10MB in real but the exported file might be slightly larger
+# than the original Google Workspace file - thus the threshold is set to 9MB
+LRO_EXPORT_SIZE_THRESHOLD = 9 * 1024 * 1024  # 9MB
+

 class GoogleDriveAccessConfig(AccessConfig):
     service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
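The constant leaves roughly 1 MB of headroom under the Drive API's 10 MB export response cap, since an exported rendition can come out larger than the stored Workspace file. A toy sketch of how such a threshold routes between the two export paths (the function and its names are illustrative, not the connector's API):

LRO_EXPORT_SIZE_THRESHOLD = 9 * 1024 * 1024  # 9,437,184 bytes

def choose_export_path(reported_size: int) -> str:
    # Illustrative routing only; the connector does this inside _export_gdrive_native_file
    return "lro" if reported_size > LRO_EXPORT_SIZE_THRESHOLD else "files.export"

assert choose_export_path(5 * 1024 * 1024) == "files.export"  # 5MB: direct export
assert choose_export_path(9_500_000) == "lro"  # 9.5MB: over the 9MB safety margin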
@@ -142,8 +146,7 @@ class GoogleDriveIndexer(Indexer):
         "originalFilename",
         "capabilities",
         "permissionIds",
-        "
-        "webContentLink",
+        "size",
     ]
 )

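Requesting size instead of webContentLink lets the downloader pick an export strategy up front. A hedged sketch of asking the Drive v3 API for the size field; creds and folder_id are assumed to exist, and the field projection here is abbreviated rather than the connector's exact list:

from googleapiclient.discovery import build

service = build("drive", "v3", credentials=creds)  # `creds` is assumed to exist
resp = service.files().list(
    q=f"'{folder_id}' in parents and trashed = false",  # `folder_id` is assumed
    fields="nextPageToken, files(id, name, mimeType, size)",
).execute()
for f in resp.get("files", []):
    # Workspace-native files may report no size; fall back to 0 like the downloader does
    print(f["name"], f.get("mimeType"), int(f.get("size", 0)))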
@@ -178,7 +181,9 @@ class GoogleDriveIndexer(Indexer):
         raise SourceConnectionError("Google drive API unreachable for an unknown reason!")

     @staticmethod
-    def count_files_recursively(
+    def count_files_recursively(
+        files_client: "GoogleAPIResource", folder_id: str, extensions: list[str] = None
+    ) -> int:
         """
         Count non-folder files recursively under the given folder.
         If `extensions` is provided, only count files
@@ -477,22 +482,26 @@ class GoogleDriveIndexer(Indexer):


 class GoogleDriveDownloaderConfig(DownloaderConfig):
-
+    lro_max_tries: int = 10
+    lro_max_time: int = 10 * 60  # 10 minutes


-
-
+def _get_extension(file_data: FileData) -> str:
+    """
+    Returns the extension for a given source MIME type.
     """
-
-
+    source_mime_type = file_data.additional_metadata.get("export_mime_type", "")
+    export_mime_type = GOOGLE_EXPORT_MIME_MAP.get(source_mime_type, "")
+    if export_mime_type:
+        return EXPORT_EXTENSION_MAP.get(export_mime_type, "")
+    return ""

-    These links emulate the behavior of Google Drive's "File > Download as..." options
-    in the UI and bypass the size limitations of `files.export()`.

-
-
-
-
+@dataclass
+class GoogleDriveDownloader(Downloader):
+    """
+    Downloads files from Google Drive using googleapis client. For native files, it uses the export
+    functionality for files <10MB and LRO (Long Running Operation) for files >10MB.
     """

     connection_config: GoogleDriveConnectionConfig
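_get_extension chains two lookups: source MIME type to export MIME type, then export MIME type to file extension. A worked sketch with illustrative map entries; only the "text/html": ".html" pairing is visible in this diff, the other entries are assumptions:

# Illustrative stand-ins for the module-level maps; all entries are assumptions
# except "text/html" -> ".html", which appears in the diff above.
GOOGLE_EXPORT_MIME_MAP = {"application/vnd.google-apps.document": "application/pdf"}
EXPORT_EXTENSION_MAP = {"application/pdf": ".pdf", "text/html": ".html"}

def get_extension(source_mime_type: str) -> str:
    # Two-step lookup: source MIME -> export MIME -> file extension
    export_mime_type = GOOGLE_EXPORT_MIME_MAP.get(source_mime_type, "")
    if export_mime_type:
        return EXPORT_EXTENSION_MAP.get(export_mime_type, "")
    return ""

print(get_extension("application/vnd.google-apps.document"))  # .pdf
print(get_extension("image/png"))  # empty string: not a Workspace-native type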
@@ -501,73 +510,233 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE

-
+    @requires_dependencies(["googleapiclient"], extras="google-drive")
+    def _direct_download_file(self, file_id, download_path: Path):
+        """Downloads a file from Google Drive using the Drive API's media download functionality.
+        The method uses Google Drive API's media download functionality to stream the file
+        content directly to disk.
+
+        Args:
+            file_id (str): The ID of the file to download from Google Drive.
+            download_path (Path): The local path where the file should be saved.
+
+        Raises:
+            SourceConnectionError: If the download operation fails.
         """
-
+        from googleapiclient.errors import HttpError
+        from googleapiclient.http import MediaIoBaseDownload

-
-
+        try:
+            with self.connection_config.get_client() as client:
+                # pylint: disable=maybe-no-member
+                request = client.get_media(fileId=file_id)
+
+                with open(download_path, "wb") as file:
+                    downloader = MediaIoBaseDownload(file, request)
+                    done = False
+                    while done is False:
+                        status, done = downloader.next_chunk()
+                        logger.debug(f"Download progress:{int(status.progress() * 100)}.")
+
+        except (HttpError, ValueError) as error:
+            logger.exception(f"Error downloading file {file_id} to {download_path}: {error}")
+            raise SourceConnectionError("Failed to download file") from error
+
+    @requires_dependencies(["googleapiclient"], extras="google-drive")
+    def _export_gdrive_file_with_lro(self, file_id: str, download_path: Path, mime_type: str):
+        """Exports a Google Drive file using Long-Running Operation (LRO) for large files
+        (>10MB of the exported file size).
+
+        This method is used when the standard export method fails due to file size limitations.
+        It uses the Drive API's LRO functionality to handle large file exports.

+        Args:
+            file_id (str): The ID of the Google Drive file to export.
+            download_path (Path): The local path where the exported file should be saved.
+            mime_type (str): The target MIME type for the exported file.
+        Raises:
+            SourceConnectionError: If the export operation fails.
+        """
+
+        import tenacity
+        from googleapiclient.errors import HttpError
+
+        max_time = self.download_config.lro_max_time
+        max_tries = self.download_config.lro_max_tries
+
+        class OperationNotFinished(Exception):
+            """
+            Exception raised when the operation is not finished.
+            """
+
+            pass
+
+        def is_fatal_code(e: Exception) -> bool:
+            """
+            Returns True if the error is fatal and should not be retried.
+            403 and 429 can mean "Too many requests" or "User rate limit exceeded"
+            which should be retried.
+            """
+            return (
+                isinstance(e, HttpError)
+                and 400 <= e.resp.status < 500
+                and e.resp.status not in [403, 429]
+            )
+
+        @tenacity.retry(
+            wait=tenacity.wait_exponential(),
+            retry=tenacity.retry_if_exception(
+                lambda e: (
+                    isinstance(e, (HttpError, OperationNotFinished)) and not is_fatal_code(e)
+                )
+            ),
+            stop=(tenacity.stop_after_attempt(max_tries) | tenacity.stop_after_delay(max_time)),
+        )
+        def _poll_operation(operation: dict, operations_client: "GoogleAPIResource") -> dict:
+            """
+            Helper function to poll the operation until it's complete.
+            Uses backoff exponential retry logic.
+
+            Each `operations.get` call uses the Google API requests limit. Details:
+            https://developers.google.com/workspace/drive/api/guides/limits
+
+            The limits as of May 2025 are:
+            - 12.000 calls per 60 seconds
+
+            In case of request limitting, the API will return 403 `User rate limit exceeded` error
+            or 429 `Too many requests` error.
+            """
+            if operation.get("done", False):
+                return operation
+            if "error" in operation:
+                raise SourceConnectionError(
+                    f"Export operation failed: {operation['error']['message']}"
+                )
+            # Refresh the operation status:
+            # FYI: In some cases the `operations.get` call errors with 403 "User does not have
+            # permission" error even if the same user create the operation with `download` method.
+            updated_operation = operations_client.get(name=operation["name"]).execute()
+            if not updated_operation.get("done", False):
+                raise OperationNotFinished()
+            return updated_operation
+
+        try:
+            with self._get_files_and_operations_client() as (files_client, operations_client):
+                # Start the LRO
+                operation = files_client.download(fileId=file_id, mimeType=mime_type).execute()
+
+                # In case the operation is not finished, poll it until it's complete
+                updated_operation = _poll_operation(operation, operations_client)
+
+                # Get the download URI from the completed operation
+                download_uri = updated_operation["response"]["downloadUri"]
+
+                # Download the file using the URI
+                self._raw_download_google_drive_file(download_uri, download_path)
+
+        except HttpError as error:
+            raise SourceConnectionError(
+                f"Failed to export file using Google Drive LRO: {error}"
+            ) from error
+
+    @requires_dependencies(["googleapiclient"], extras="google-drive")
+    def _export_gdrive_native_file(
+        self, file_id: str, download_path: Path, mime_type: str, file_size: int
+    ):
+        """Exports a Google Drive native file (Docs, Sheets, Slides) to a specified format.
+
+        This method uses the Google Drive API's export functionality to convert Google Workspace
+        files to other formats (e.g., Google Docs to PDF, Google Sheets to Excel).
+        For files larger than 10MB, it falls back to using Long-Running Operation (LRO).
+
+        Args:
+            file_id (str): The ID of the Google Drive file to export.
+            download_path (Path): The local path where the exported file should be saved.
+            mime_type (str): The target MIME type for the exported file (e.g., 'application/pdf').
+            file_size (int): The size of the file to export - used to determine if the
+                file is large enough to use LRO instead of direct export endpoint.
         Returns:
-
+            bytes: The exported file content.

         Raises:
-
+            HttpError: If the export operation fails.
         """
+        from googleapiclient.errors import HttpError
+        from googleapiclient.http import MediaIoBaseDownload
+
+        if file_size > LRO_EXPORT_SIZE_THRESHOLD:
+            self._export_gdrive_file_with_lro(file_id, download_path, mime_type)
+            return
+
         with self.connection_config.get_client() as client:
-
+            try:
+                # pylint: disable=maybe-no-member
+                request = client.export_media(fileId=file_id, mimeType=mime_type)
+                with open(download_path, "wb") as file:
+                    downloader = MediaIoBaseDownload(file, request)
+                    done = False
+                    while done is False:
+                        status, done = downloader.next_chunk()
+                        logger.debug(f"Download progress: {int(status.progress() * 100)}.")
+            except HttpError as error:
+                if error.resp.status == 403 and "too large" in error.reason.lower():
+                    # Even though we have the LRO threashold, for some smaller files the
+                    # export size might exceed 10MB and we get a 403 error.
+                    # In that case, we use LRO as a fallback.
+                    self._export_gdrive_file_with_lro(file_id, download_path, mime_type)
+                else:
+                    raise SourceConnectionError(f"Failed to export file: {error}") from error

-
-
+    @requires_dependencies(["googleapiclient"], extras="google-drive")
+    @contextmanager
+    def _get_files_and_operations_client(
+        self,
+    ) -> Generator[tuple["GoogleAPIResource", "GoogleAPIResource"], None, None]:
+        """
+        Returns a context manager for the files and operations clients for the Google Drive API.

-
-
-
-
-
-        return url, ext
+        Yields:
+            Tuple[GoogleAPIResource, GoogleAPIResource]: A tuple of the files
+                and operations clients.
+        """
+        from googleapiclient.discovery import build

-
-
-
+        creds = self._get_credentials()
+        service = build("drive", "v3", credentials=creds)
+        with (
+            service.operations() as operations_client,
+            service.files() as files_client,
+        ):
+            yield files_client, operations_client

-    @requires_dependencies(["httpx"
-    def
+    @requires_dependencies(["httpx"])
+    def _raw_download_google_drive_file(self, url: str, download_path: Path) -> Path:
         """
         Streams file content directly to disk using authenticated HTTP request.
+        Must use httpx to stream the file to disk as currently there's no google SDK
+        functionality to download a file like for get media or export operations.

         Writes the file to the correct path in the download directory while downloading.
         Avoids buffering large files in memory.

-
-
+        Args:
+            url (str): The URL of the file to download.
+            download_path (Path): The path to save the downloaded file.

-
-
+        Returns:
+            Path: The path to the downloaded file.
         """
         import httpx
         from google.auth.transport.requests import Request
-        from google.oauth2 import service_account

-
-
-        creds = service_account.Credentials.from_service_account_info(
-            key_data,
-            scopes=["https://www.googleapis.com/auth/drive.readonly"],
-        )
+        creds = self._get_credentials()
+
         creds.refresh(Request())

         headers = {
             "Authorization": f"Bearer {creds.token}",
         }

-        download_path = self.get_download_path(file_data)
-        if ext:
-            download_path = download_path.with_suffix(ext)
-
-        download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.debug(f"Streaming file to {download_path}")
-
         with (
             httpx.Client(timeout=None, follow_redirects=True) as client,
             client.stream("GET", url, headers=headers) as response,
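The polling policy combines tenacity stop conditions with OR semantics: give up after lro_max_tries attempts or after lro_max_time seconds, whichever comes first, backing off exponentially in between. A self-contained sketch of the same combinator pattern against a simulated long-running operation:

import tenacity

class OperationNotFinished(Exception):
    pass

attempts = {"n": 0}

@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=0.01),  # small multiplier so the demo runs fast
    retry=tenacity.retry_if_exception_type(OperationNotFinished),
    stop=(tenacity.stop_after_attempt(10) | tenacity.stop_after_delay(600)),
)
def poll() -> str:
    attempts["n"] += 1
    if attempts["n"] < 4:
        raise OperationNotFinished()  # not done yet: retry with exponential backoff
    return "done"

print(poll(), "after", attempts["n"], "attempts")  # done after 4 attempts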
@@ -579,26 +748,91 @@ class GoogleDriveDownloader(Downloader):
             with open(download_path, "wb") as f:
                 for chunk in response.iter_bytes():
                     f.write(chunk)
+        return download_path
+
+    @requires_dependencies(["google"], extras="google-drive")
+    def _get_credentials(self):
+        """
+        Retrieves the credentials for Google Drive API access.
+
+        Returns:
+            Credentials: The credentials for Google Drive API access.
+        """
+        from google.oauth2 import service_account
+
+        access_config = self.connection_config.access_config.get_secret_value()
+        key_data = access_config.get_service_account_key()
+        creds = service_account.Credentials.from_service_account_info(
+            key_data,
+            scopes=["https://www.googleapis.com/auth/drive.readonly"],
+        )
+        return creds
+
+    def _download_file(self, file_data: FileData) -> Path:
+        """Downloads a file from Google Drive using either direct download or export based
+        on the source file's MIME type.
+
+        This method determines the appropriate download method based on the file's MIME type:
+        - For Google Workspace files (Docs, Sheets, Slides), uses export functionality
+        - For other files, uses direct download
+
+        Args:
+            file_data (FileData): The metadata of the file being downloaded.
+
+        Returns:
+            Path: The path to the downloaded file.
+
+        Raises:
+            SourceConnectionError: If the download fails.
+        """
+        mime_type = file_data.additional_metadata.get("mimeType", "")
+        file_size = int(file_data.additional_metadata.get("size", 0))
+        file_id = file_data.identifier
+
+        download_path = self.get_download_path(file_data)
+        if not download_path:
+            raise SourceConnectionError(f"Failed to get download path for file {file_id}")
+
+        if mime_type in GOOGLE_EXPORT_MIME_MAP:
+            # For Google Workspace files, use export functionality
+            ext = _get_extension(file_data)
+            download_path = download_path.with_suffix(ext)
+            download_path.parent.mkdir(parents=True, exist_ok=True)
+            export_mime = GOOGLE_EXPORT_MIME_MAP[mime_type]
+            self._export_gdrive_native_file(
+                file_id=file_id,
+                download_path=download_path,
+                mime_type=export_mime,
+                file_size=file_size,
+            )
+            file_data.additional_metadata.update(
+                {
+                    "export_mime_type": export_mime,
+                    "export_extension": ext,
+                    "download_method": "google_workspace_export",
+                }
+            )
+        else:
+            # For other files, use direct download
+            download_path.parent.mkdir(parents=True, exist_ok=True)
+            self._direct_download_file(file_id=file_id, download_path=download_path)
+            file_data.additional_metadata.update(
+                {
+                    "download_method": "direct_download",
+                }
+            )

         return download_path

     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         mime_type = file_data.additional_metadata.get("mimeType", "")
-        record_id = file_data.identifier

         logger.debug(
             f"Downloading file {file_data.source_identifiers.fullpath} of type {mime_type}"
         )

-
-        download_path = self._download_url(file_data, download_url, ext)
+        download_path = self._download_file(file_data)

-        file_data.additional_metadata.update(
-            {
-                "download_method": "export_link" if ext else "web_content_link",
-                "download_url_used": download_url,
-            }
-        )
         file_data.local_download_path = str(download_path.resolve())

         return self.generate_download_response(file_data=file_data, download_path=download_path)
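Both branches of _download_file record how the bytes reached disk, so later pipeline steps can tell an exported rendition from a byte-for-byte copy. A small sketch of consuming that metadata; FileDataStub is a hypothetical stand-in for the library's FileData:

from dataclasses import dataclass, field

@dataclass
class FileDataStub:
    # Hypothetical stand-in for unstructured-ingest's FileData
    identifier: str
    additional_metadata: dict = field(default_factory=dict)

fd = FileDataStub(identifier="abc123")
fd.additional_metadata.update(
    {
        "export_mime_type": "application/pdf",
        "export_extension": ".pdf",
        "download_method": "google_workspace_export",
    }
)

# Downstream steps can branch on how the file reached local disk:
if fd.additional_metadata.get("download_method") == "google_workspace_export":
    print("exported rendition:", fd.additional_metadata["export_extension"])
else:
    print("byte-for-byte copy of the Drive file")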
unstructured_ingest/processes/connectors/onedrive.py
@@ -370,14 +370,14 @@ class OnedriveUploader(Uploader):
         # Use the remote_url from upload_config as the base destination folder
         base_destination_folder = self.upload_config.url

-        # Use the file's
-        if file_data.source_identifiers and file_data.source_identifiers.
-            # Combine the base destination folder with the file's
+        # Use the file's full path to maintain directory structure, if needed
+        if file_data.source_identifiers and file_data.source_identifiers.fullpath:
+            # Combine the base destination folder with the file's full path
             destination_path = Path(base_destination_folder) / Path(
-                f"{file_data.source_identifiers.
+                f"{file_data.source_identifiers.fullpath}.json"
             )
         else:
-            # If no
+            # If no full path is provided, upload directly to the base destination folder
             destination_path = Path(base_destination_folder) / f"{path.name}.json"

         destination_folder = destination_path.parent
unstructured_ingest/processes/connectors/redisdb.py
@@ -32,7 +32,9 @@ class RedisAccessConfig(AccessConfig):
         default=None, description="If not anonymous, use this uri, if specified."
     )
     password: Optional[str] = Field(
-        default=None,
+        default=None,
+        description="Password used to connect to database if uri is "
+        "not specified and connection is not anonymous.",
     )

@@ -41,20 +43,32 @@ class RedisConnectionConfig(ConnectionConfig):
         default=RedisAccessConfig(), validate_default=True
     )
     host: Optional[str] = Field(
-        default=None,
+        default=None,
+        description="Hostname or IP address of a Redis instance to connect to "
+        "if uri is not specified.",
     )
     database: int = Field(default=0, description="Database index to connect to.")
-    port: int = Field(
+    port: Optional[int] = Field(
+        default=6379, description="Port used to connect to database if uri is not specified."
+    )
     username: Optional[str] = Field(
-        default=None, description="Username used to connect to database."
+        default=None, description="Username used to connect to database if uri is not specified."
+    )
+    ssl: Optional[bool] = Field(
+        default=True,
+        description="Whether the connection should use SSL encryption if uri is not specified.",
     )
-    ssl: bool = Field(default=True, description="Whether the connection should use SSL encryption.")
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

     @model_validator(mode="after")
     def validate_host_or_url(self) -> "RedisConnectionConfig":
-        if not self.access_config.get_secret_value().uri
-
+        if not self.access_config.get_secret_value().uri:
+            if not self.host:
+                raise ValueError("Please pass a hostname either directly or through uri")
+            if self.port is None:
+                raise ValueError("Since URI is not specified, port cannot be None")
+            if self.ssl is None:
+                raise ValueError("Since URI is not specified, ssl cannot be None")
         return self

     @requires_dependencies(["redis"], extras="redis")
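The validator now insists that host, port, and ssl are all usable whenever no uri is supplied; since the fields are Optional, a caller could otherwise set them to None explicitly. A self-contained pydantic v2 sketch of the same pattern, with a simplified stand-in model (no secret wrapper):

from typing import Optional
from pydantic import BaseModel, model_validator

class ConnStub(BaseModel):
    # Hypothetical stand-in for RedisConnectionConfig, without the access-config wrapper
    uri: Optional[str] = None
    host: Optional[str] = None
    port: Optional[int] = 6379
    ssl: Optional[bool] = True

    @model_validator(mode="after")
    def validate_host_or_uri(self) -> "ConnStub":
        if not self.uri:
            if not self.host:
                raise ValueError("Please pass a hostname either directly or through uri")
            if self.port is None:
                raise ValueError("Since URI is not specified, port cannot be None")
            if self.ssl is None:
                raise ValueError("Since URI is not specified, ssl cannot be None")
        return self

ConnStub(uri="rediss://localhost:6379")  # ok: uri provided
ConnStub(host="localhost")               # ok: host plus port/ssl defaults
# ConnStub(host="localhost", port=None)  # would raise: port cannot be None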
@@ -64,21 +78,20 @@ class RedisConnectionConfig(ConnectionConfig):

         access_config = self.access_config.get_secret_value()

-        options = {
-            "host": self.host,
-            "port": self.port,
-            "db": self.database,
-            "ssl": self.ssl,
-            "username": self.username,
-        }
-
-        if access_config.password:
-            options["password"] = access_config.password
-
         if access_config.uri:
             async with from_url(access_config.uri) as client:
                 yield client
         else:
+            options = {
+                "host": self.host,
+                "port": self.port,
+                "db": self.database,
+                "ssl": self.ssl,
+                "username": self.username,
+            }
+
+            if access_config.password:
+                options["password"] = access_config.password
             async with Redis(**options) as client:
                 yield client

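Moving the options dict into the else branch means the keyword arguments are only assembled when from_url is not used. A short sketch of the two connection paths with redis.asyncio; the host and URI are placeholders and assume a reachable Redis:

import asyncio
from redis.asyncio import Redis, from_url

async def ping(uri: str | None) -> bool:
    if uri:
        client = from_url(uri)
    else:
        # Keyword options are only assembled on this branch, mirroring the connector
        client = Redis(host="localhost", port=6379, db=0, ssl=False)
    async with client:
        return await client.ping()

print(asyncio.run(ping("redis://localhost:6379")))  # assumes a local Redis instance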
@@ -113,6 +126,20 @@ class RedisUploaderConfig(UploaderConfig):
     key_prefix: str = Field(default="", description="Prefix for Redis keys")


+def _form_redis_pipeline_error_message(error: str) -> str:
+    """
+    Form a user-friendly error message for Redis pipeline errors.
+    The error message has `$` character at the beginning and `) of pipeline` at the end.
+    Everything between these two strings is the value an should be removed.
+    """
+    start = error.find("$")
+    end = error.find(") of pipeline")
+    if start != -1 and end != -1:
+        return error[: start + 1] + "<value>" + error[end:]
+    else:
+        return error
+
+
 @dataclass
 class RedisUploader(Uploader):
     upload_config: RedisUploaderConfig
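The helper redacts whatever Redis echoes back between the "$" argument and ") of pipeline", which is the uploaded document body. A runnable demonstration with a made-up error string (the real ResponseError text may differ):

def _form_redis_pipeline_error_message(error: str) -> str:
    # Copy of the helper from the diff above, for demonstration
    start = error.find("$")
    end = error.find(") of pipeline")
    if start != -1 and end != -1:
        return error[: start + 1] + "<value>" + error[end:]
    return error

# Hypothetical error text that echoes the uploaded JSON document back:
raw = 'Command # 1 (JSON.SET key $ {"text": "...document body..."}) of pipeline caused error'
print(_form_redis_pipeline_error_message(raw))
# Command # 1 (JSON.SET key $<value>) of pipeline caused error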
@@ -169,14 +196,14 @@ class RedisUploader(Uploader):
                 # Redis with stack extension supports JSON type
                 await pipe.json().set(key_with_prefix, "$", element).execute()
             except redis_exceptions.ResponseError as e:
-                message = str(e)
+                message = _form_redis_pipeline_error_message(str(e))
                 if "unknown command `JSON.SET`" in message:
                     # if this error occurs, Redis server doesn't support JSON type,
                     # so save as string type instead
                     await pipe.set(key_with_prefix, json.dumps(element)).execute()
                     redis_stack = False
                 else:
-                    raise e
+                    raise redis_exceptions.ResponseError(message) from e
         return redis_stack


{unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: unstructured_ingest
-Version: 1.0.28
+Version: 1.0.32
 Summary: Local ETL data pipeline to get data RAG ready
 Author-email: Unstructured Technologies <devops@unstructuredai.io>
 License-Expression: Apache-2.0
@@ -87,6 +87,7 @@ Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == 'gitlab'
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == 'google-drive'
+Requires-Dist: tenacity; extra == 'google-drive'
 Provides-Extra: hubspot
 Requires-Dist: hubspot-api-client; extra == 'hubspot'
 Requires-Dist: urllib3; extra == 'hubspot'
@@ -163,7 +164,7 @@ Requires-Dist: qdrant-client; extra == 'qdrant'
 Provides-Extra: reddit
 Requires-Dist: praw; extra == 'reddit'
 Provides-Extra: redis
-Requires-Dist: redis; extra == 'redis'
+Requires-Dist: redis<=5.3.0; extra == 'redis'
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.30.0; extra == 'remote'
 Provides-Extra: rst
{unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/RECORD
@@ -1,5 +1,5 @@
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=tjMRa0J78uLr4Q1KetAAzKJ8jimjDywme7jVeFHwNx4,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
 unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -72,17 +72,17 @@ unstructured_ingest/processes/connectors/delta_table.py,sha256=2DFox_Vzoopt_D3Jy
 unstructured_ingest/processes/connectors/discord.py,sha256=6yEJ_agfKUqsV43wFsbMkcd8lcLJC0uqbo4izjdZ3rU,5294
 unstructured_ingest/processes/connectors/github.py,sha256=smHCz6jOH1p_hW2S25bYunBBj_pYjz8HTw6wkzaJz_A,7765
 unstructured_ingest/processes/connectors/gitlab.py,sha256=6h1CdqznJmzeWxGfXrFLdNdT23PExGnUMMX7usK_4Kk,10013
-unstructured_ingest/processes/connectors/google_drive.py,sha256=
+unstructured_ingest/processes/connectors/google_drive.py,sha256=W6zjpuNS-mnLJtwTKAAPn0_4pcEc1bySO2u4V3fPXVo,35250
 unstructured_ingest/processes/connectors/jira.py,sha256=a7OuVi4RFfr22Tqgk60lwmtWTRBw2fI1m8KPqfA8Ffo,18504
 unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03DCxOVr7yquj9VYM55Rtc,5174
 unstructured_ingest/processes/connectors/local.py,sha256=LluTLKv4g7FbJb4A6vuSxI9VhzKZuuQUpDS-cVNAQ2g,7426
 unstructured_ingest/processes/connectors/milvus.py,sha256=Jr9cul7By03tGAPFnFBoqncnNWwbhKd-qbmkuqnin8U,8908
 unstructured_ingest/processes/connectors/mongodb.py,sha256=1g_5bfbS6lah3nsOXqLAanR3zNYJ47_Njw_uV-uj3_U,14324
 unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
-unstructured_ingest/processes/connectors/onedrive.py,sha256=
+unstructured_ingest/processes/connectors/onedrive.py,sha256=k0bhQCCSIgmHAk3lQd4CMA3dc4fPAjegNlLxlDWGowc,19284
 unstructured_ingest/processes/connectors/outlook.py,sha256=zHM5frO7CqQG0-KcTyX49aZeSlsvVrl8kh_lR_ESgQw,9275
 unstructured_ingest/processes/connectors/pinecone.py,sha256=pSREUNsQqel6q1EFZsFWelg-uZgGubQY5m_6nVnBFKs,15090
-unstructured_ingest/processes/connectors/redisdb.py,sha256=
+unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
 unstructured_ingest/processes/connectors/salesforce.py,sha256=OaKEWCqZrirHqFJ650K5jSPwYlWefPOapas8Y-4D9oc,11661
 unstructured_ingest/processes/connectors/sharepoint.py,sha256=jI-erp4YUfHxPeUTcfHSPEG3w0wjSBYfAnMg1WT6lfw,4996
 unstructured_ingest/processes/connectors/slack.py,sha256=EkFj9PcAu5_gF2xLogikKDADLbJYq-_jvchzYrTdLO4,9224
@@ -92,7 +92,7 @@ unstructured_ingest/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-
 unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=8a9HTcRWA6IuswSD632b_uZSO6Dax_0rUYnflqktcek,226
 unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
 unstructured_ingest/processes/connectors/databricks/__init__.py,sha256=RtKAPyNtXh6fzEsOQ08pA0-vC1uMr3KqYG6cqiBoo70,2133
-unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=
+unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=fZeXRozTUM3JeZlmsxhn_glqRhxr8CGG-8I8QRhRcP8,8232
 unstructured_ingest/processes/connectors/databricks/volumes_aws.py,sha256=WhGTp6aRTLSdc4GChCL4mz2b-IanderW8j1IqezX6YA,2958
 unstructured_ingest/processes/connectors/databricks/volumes_azure.py,sha256=pF2d6uAIbwJJUeOIG5xknUMCGc5d9Aztmc2776wp-a0,3740
 unstructured_ingest/processes/connectors/databricks/volumes_gcp.py,sha256=y9AvVl6PtnIxlTlrPj_wyHBDBRJNq3uoTOuZwTryNg8,2994
@@ -109,7 +109,7 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
 unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
 unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
 unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
-unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=
+unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=4K8Q2D_6_HCqTVM3HBJv3SNz9gjbQhk44nzeSheDpzA,14462
 unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
 unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=2ZV6b2E2pIsf_ab1Lty74FwpMnJZhpQUdamPgpwcKsQ,7141
 unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
 unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
 unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
 unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
-unstructured_ingest-1.0.28.dist-info/METADATA,sha256=
-unstructured_ingest-1.0.28.dist-info/WHEEL,sha256=
-unstructured_ingest-1.0.28.dist-info/entry_points.txt,sha256=
-unstructured_ingest-1.0.28.dist-info/licenses/LICENSE.md,sha256=
-unstructured_ingest-1.0.28.dist-info/RECORD,,
+unstructured_ingest-1.0.32.dist-info/METADATA,sha256=CaoYV49uBnrTClcA3h67r-wol4XH4KaZTo0WMWITX7Q,8747
+unstructured_ingest-1.0.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+unstructured_ingest-1.0.32.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-1.0.32.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-1.0.32.dist-info/RECORD,,
{unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/WHEEL
RENAMED
File without changes

{unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/entry_points.txt
RENAMED
File without changes

{unstructured_ingest-1.0.28.dist-info → unstructured_ingest-1.0.32.dist-info}/licenses/LICENSE.md
RENAMED
File without changes