unstructured-ingest 1.0.27__tar.gz → 1.0.31__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (239)
  1. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/PKG-INFO +2 -2
  2. unstructured_ingest-1.0.31/unstructured_ingest/__version__.py +1 -0
  3. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/vertexai.py +1 -1
  4. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/databricks/volumes.py +8 -3
  5. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +4 -5
  6. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/google_drive.py +295 -61
  7. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/onedrive.py +5 -5
  8. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/redisdb.py +47 -20
  9. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/sharepoint.py +5 -1
  10. unstructured_ingest-1.0.27/unstructured_ingest/__version__.py +0 -1
  11. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/.gitignore +0 -0
  12. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/LICENSE.md +0 -0
  13. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/README.md +0 -0
  14. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/pyproject.toml +0 -0
  15. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/__init__.py +0 -0
  16. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/README.md +0 -0
  17. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/__init__.py +0 -0
  18. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/base/__init__.py +0 -0
  19. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/base/cmd.py +0 -0
  20. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/base/dest.py +0 -0
  21. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/base/importer.py +0 -0
  22. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/base/src.py +0 -0
  23. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/cli.py +0 -0
  24. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/cmds.py +0 -0
  25. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/utils/__init__.py +0 -0
  26. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/utils/click.py +0 -0
  27. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
  28. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/data_types/__init__.py +0 -0
  29. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/data_types/entities.py +0 -0
  30. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/data_types/file_data.py +0 -0
  31. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/__init__.py +0 -0
  32. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/azure_openai.py +0 -0
  33. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/bedrock.py +0 -0
  34. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/huggingface.py +0 -0
  35. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/interfaces.py +0 -0
  36. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/mixedbreadai.py +0 -0
  37. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/octoai.py +0 -0
  38. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/openai.py +0 -0
  39. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/togetherai.py +0 -0
  40. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/embed/voyageai.py +0 -0
  41. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/error.py +0 -0
  42. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/errors_v2.py +0 -0
  43. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/interfaces/__init__.py +0 -0
  44. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/interfaces/connector.py +0 -0
  45. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/interfaces/downloader.py +0 -0
  46. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/interfaces/indexer.py +0 -0
  47. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/interfaces/process.py +0 -0
  48. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/interfaces/processor.py +0 -0
  49. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/interfaces/upload_stager.py +0 -0
  50. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/interfaces/uploader.py +0 -0
  51. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/logger.py +0 -0
  52. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/main.py +0 -0
  53. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/otel.py +0 -0
  54. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/__init__.py +0 -0
  55. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/interfaces.py +0 -0
  56. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/otel.py +0 -0
  57. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/pipeline.py +0 -0
  58. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
  59. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
  60. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/download.py +0 -0
  61. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/embed.py +0 -0
  62. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/filter.py +0 -0
  63. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/index.py +0 -0
  64. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/partition.py +0 -0
  65. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/stage.py +0 -0
  66. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
  67. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/pipeline/steps/upload.py +0 -0
  68. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/__init__.py +0 -0
  69. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/chunker.py +0 -0
  70. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connector_registry.py +0 -0
  71. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/__init__.py +0 -0
  72. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/airtable.py +0 -0
  73. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  74. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
  75. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
  76. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/astradb.py +0 -0
  77. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
  78. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/chroma.py +0 -0
  79. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/confluence.py +0 -0
  80. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
  81. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
  82. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
  83. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
  84. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
  85. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
  86. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
  87. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
  88. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/discord.py +0 -0
  89. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
  90. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
  91. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
  92. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
  93. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
  94. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
  95. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
  96. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
  97. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
  98. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
  99. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
  100. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
  101. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
  102. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
  103. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
  104. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/github.py +0 -0
  105. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
  106. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
  107. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
  108. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/jira.py +0 -0
  109. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
  110. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
  111. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
  112. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
  113. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
  114. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
  115. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
  116. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
  117. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
  118. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
  119. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
  120. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
  121. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/local.py +0 -0
  122. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/milvus.py +0 -0
  123. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
  124. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
  125. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  126. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
  127. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
  128. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
  129. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
  130. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
  131. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
  132. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
  133. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
  134. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  135. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
  136. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
  137. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
  138. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
  139. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
  140. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
  141. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
  142. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
  143. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
  144. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
  145. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
  146. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
  147. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
  148. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
  149. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
  150. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
  151. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
  152. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
  153. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
  154. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
  155. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
  156. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
  157. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
  158. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
  159. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
  160. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
  161. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
  162. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
  163. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
  164. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
  165. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
  166. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
  167. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
  168. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
  169. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
  170. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
  171. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
  172. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
  173. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
  174. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
  175. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
  176. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
  177. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
  178. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
  179. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
  180. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
  181. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
  182. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
  183. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
  184. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
  185. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
  186. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
  187. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
  188. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
  189. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
  190. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
  191. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
  192. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
  193. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
  194. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
  195. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/outlook.py +0 -0
  196. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/pinecone.py +0 -0
  197. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
  198. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
  199. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
  200. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
  201. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
  202. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
  203. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/slack.py +0 -0
  204. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
  205. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
  206. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
  207. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
  208. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
  209. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
  210. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
  211. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
  212. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/utils.py +0 -0
  213. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/vectara.py +0 -0
  214. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
  215. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
  216. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
  217. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
  218. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
  219. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  220. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
  221. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
  222. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/embedder.py +0 -0
  223. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/filter.py +0 -0
  224. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/partitioner.py +0 -0
  225. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/uncompress.py +0 -0
  226. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/utils/__init__.py +0 -0
  227. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
  228. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/unstructured_api.py +0 -0
  229. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/__init__.py +0 -0
  230. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/chunking.py +0 -0
  231. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/compression.py +0 -0
  232. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/constants.py +0 -0
  233. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/data_prep.py +0 -0
  234. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/dep_check.py +0 -0
  235. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/html.py +0 -0
  236. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/ndjson.py +0 -0
  237. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/pydantic_models.py +0 -0
  238. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  239. {unstructured_ingest-1.0.27 → unstructured_ingest-1.0.31}/unstructured_ingest/utils/table.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.27
3
+ Version: 1.0.31
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -163,7 +163,7 @@ Requires-Dist: qdrant-client; extra == 'qdrant'
163
163
  Provides-Extra: reddit
164
164
  Requires-Dist: praw; extra == 'reddit'
165
165
  Provides-Extra: redis
166
- Requires-Dist: redis; extra == 'redis'
166
+ Requires-Dist: redis<=5.3.0; extra == 'redis'
167
167
  Provides-Extra: remote
168
168
  Requires-Dist: unstructured-client>=0.30.0; extra == 'remote'
169
169
  Provides-Extra: rst
@@ -0,0 +1 @@
1
+ __version__ = "1.0.31" # pragma: no cover
@@ -34,7 +34,7 @@ ApiKeyType = Secret[Annotated[dict, BeforeValidator(conform_string_to_dict)]]
34
34
  class VertexAIEmbeddingConfig(EmbeddingConfig):
35
35
  api_key: ApiKeyType = Field(description="API key for Vertex AI")
36
36
  embedder_model_name: Optional[str] = Field(
37
- default="textembedding-gecko@001", alias="model_name", description="Vertex AI model name"
37
+ default="text-embedding-005", alias="model_name", description="Vertex AI model name"
38
38
  )
39
39
 
40
40
  def wrap_error(self, e: Exception) -> Exception:
@@ -196,9 +196,14 @@ class DatabricksVolumesUploader(Uploader, ABC):
196
196
  connection_config: DatabricksVolumesConnectionConfig
197
197
 
198
198
  def get_output_path(self, file_data: FileData) -> str:
199
- return os.path.join(
200
- self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
201
- )
199
+ if file_data.source_identifiers.fullpath:
200
+ return os.path.join(
201
+ self.upload_config.path, f"{file_data.source_identifiers.fullpath}.json"
202
+ )
203
+ else:
204
+ return os.path.join(
205
+ self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
206
+ )
202
207
 
203
208
  def precheck(self) -> None:
204
209
  try:
@@ -343,10 +343,9 @@ class FsspecUploader(Uploader):
343
343
  raise self.wrap_error(e=e)
344
344
 
345
345
  def get_upload_path(self, file_data: FileData) -> Path:
346
- upload_path = (
347
- Path(self.upload_config.path_without_protocol)
348
- / file_data.source_identifiers.relative_path
349
- )
346
+ upload_path = Path(
347
+ self.upload_config.path_without_protocol
348
+ ) / file_data.source_identifiers.fullpath.lstrip("/")
350
349
  updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
351
350
  return updated_upload_path
352
351
 
@@ -358,8 +357,8 @@ class FsspecUploader(Uploader):
358
357
  client.upload(lpath=path_str, rpath=upload_path.as_posix())
359
358
 
360
359
  async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
361
- upload_path = self.get_upload_path(file_data=file_data)
362
360
  path_str = str(path.resolve())
361
+ upload_path = self.get_upload_path(file_data=file_data)
363
362
  # Odd that fsspec doesn't run exists() as async even when client support async
364
363
  logger.debug(f"writing local file {path_str} to {upload_path}")
365
364
  with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
@@ -52,6 +52,10 @@ EXPORT_EXTENSION_MAP = {
52
52
  "text/html": ".html",
53
53
  }
54
54
 
55
+ # The real LRO export size threshold is 10MB, but the exported file might be slightly larger
56
+ # than the original Google Workspace file - thus the threshold is set to 9MB
57
+ LRO_EXPORT_SIZE_THRESHOLD = 9 * 1024 * 1024 # 9MB
58
+
55
59
 
56
60
  class GoogleDriveAccessConfig(AccessConfig):
57
61
  service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
@@ -142,8 +146,7 @@ class GoogleDriveIndexer(Indexer):
142
146
  "originalFilename",
143
147
  "capabilities",
144
148
  "permissionIds",
145
- "webViewLink",
146
- "webContentLink",
149
+ "size",
147
150
  ]
148
151
  )
149
152
 
@@ -178,7 +181,9 @@ class GoogleDriveIndexer(Indexer):
178
181
  raise SourceConnectionError("Google drive API unreachable for an unknown reason!")
179
182
 
180
183
  @staticmethod
181
- def count_files_recursively(files_client, folder_id: str, extensions: list[str] = None) -> int:
184
+ def count_files_recursively(
185
+ files_client: "GoogleAPIResource", folder_id: str, extensions: list[str] = None
186
+ ) -> int:
182
187
  """
183
188
  Count non-folder files recursively under the given folder.
184
189
  If `extensions` is provided, only count files
@@ -477,22 +482,26 @@ class GoogleDriveIndexer(Indexer):
477
482
 
478
483
 
479
484
  class GoogleDriveDownloaderConfig(DownloaderConfig):
480
- pass
485
+ lro_max_tries: int = 10
486
+ lro_max_time: int = 10 * 60 # 10 minutes
481
487
 
482
488
 
483
- @dataclass
484
- class GoogleDriveDownloader(Downloader):
489
+ def _get_extension(file_data: FileData) -> str:
490
+ """
491
+ Returns the extension for a given source MIME type.
485
492
  """
486
- Downloads files from Google Drive using authenticated direct HTTP requests
487
- via `exportLinks` (for Google-native files) and `webContentLink` (for binary files).
493
+ source_mime_type = file_data.additional_metadata.get("export_mime_type", "")
494
+ export_mime_type = GOOGLE_EXPORT_MIME_MAP.get(source_mime_type, "")
495
+ if export_mime_type:
496
+ return EXPORT_EXTENSION_MAP.get(export_mime_type, "")
497
+ return ""
488
498
 
489
- These links emulate the behavior of Google Drive's "File > Download as..." options
490
- in the UI and bypass the size limitations of `files.export()`.
491
499
 
492
- Behavior:
493
- - Google-native formats are downloaded using `exportLinks` in appropriate MIME formats.
494
- - Binary files (non-Google-native) are downloaded using `webContentLink`.
495
- - All downloads are performed via `requests.get()` using a valid bearer token.
500
+ @dataclass
501
+ class GoogleDriveDownloader(Downloader):
502
+ """
503
+ Downloads files from Google Drive using googleapis client. For native files, it uses the export
504
+ functionality for files <10MB and LRO (Long Running Operation) for files >10MB.
496
505
  """
497
506
 
498
507
  connection_config: GoogleDriveConnectionConfig
@@ -501,73 +510,233 @@ class GoogleDriveDownloader(Downloader):
501
510
  )
502
511
  connector_type: str = CONNECTOR_TYPE
503
512
 
504
- def _get_download_url_and_ext(self, file_id: str, mime_type: str) -> tuple[str, str]:
513
+ @requires_dependencies(["googleapiclient"], extras="google-drive")
514
+ def _direct_download_file(self, file_id, download_path: Path):
515
+ """Downloads a file from Google Drive using the Drive API's media download functionality.
516
+ The method uses Google Drive API's media download functionality to stream the file
517
+ content directly to disk.
518
+
519
+ Args:
520
+ file_id (str): The ID of the file to download from Google Drive.
521
+ download_path (Path): The local path where the file should be saved.
522
+
523
+ Raises:
524
+ SourceConnectionError: If the download operation fails.
505
525
  """
506
- Resolves the appropriate download URL and expected file extension for a Google Drive file.
526
+ from googleapiclient.errors import HttpError
527
+ from googleapiclient.http import MediaIoBaseDownload
507
528
 
508
- - Google-native files use export MIME types from exportLinks (e.g., .docx, .xlsx).
509
- - Binary files use webContentLink (e.g., uploaded PDFs or ZIPs).
529
+ try:
530
+ with self.connection_config.get_client() as client:
531
+ # pylint: disable=maybe-no-member
532
+ request = client.get_media(fileId=file_id)
533
+
534
+ with open(download_path, "wb") as file:
535
+ downloader = MediaIoBaseDownload(file, request)
536
+ done = False
537
+ while done is False:
538
+ status, done = downloader.next_chunk()
539
+ logger.debug(f"Download progress:{int(status.progress() * 100)}.")
540
+
541
+ except (HttpError, ValueError) as error:
542
+ logger.exception(f"Error downloading file {file_id} to {download_path}: {error}")
543
+ raise SourceConnectionError("Failed to download file") from error
544
+
545
+ @requires_dependencies(["googleapiclient"], extras="google-drive")
546
+ def _export_gdrive_file_with_lro(self, file_id: str, download_path: Path, mime_type: str):
547
+ """Exports a Google Drive file using Long-Running Operation (LRO) for large files
548
+ (>10MB of the exported file size).
549
+
550
+ This method is used when the standard export method fails due to file size limitations.
551
+ It uses the Drive API's LRO functionality to handle large file exports.
510
552
 
553
+ Args:
554
+ file_id (str): The ID of the Google Drive file to export.
555
+ download_path (Path): The local path where the exported file should be saved.
556
+ mime_type (str): The target MIME type for the exported file.
557
+ Raises:
558
+ SourceConnectionError: If the export operation fails.
559
+ """
560
+
561
+ import tenacity
562
+ from googleapiclient.errors import HttpError
563
+
564
+ max_time = self.download_config.lro_max_time
565
+ max_tries = self.download_config.lro_max_tries
566
+
567
+ class OperationNotFinished(Exception):
568
+ """
569
+ Exception raised when the operation is not finished.
570
+ """
571
+
572
+ pass
573
+
574
+ def is_fatal_code(e: Exception) -> bool:
575
+ """
576
+ Returns True if the error is fatal and should not be retried.
577
+ 403 and 429 can mean "Too many requests" or "User rate limit exceeded"
578
+ which should be retried.
579
+ """
580
+ return (
581
+ isinstance(e, HttpError)
582
+ and 400 <= e.resp.status < 500
583
+ and e.resp.status not in [403, 429]
584
+ )
585
+
586
+ @tenacity.retry(
587
+ wait=tenacity.wait_exponential(),
588
+ retry=tenacity.retry_if_exception(
589
+ lambda e: (
590
+ isinstance(e, (HttpError, OperationNotFinished)) and not is_fatal_code(e)
591
+ )
592
+ ),
593
+ stop=(tenacity.stop_after_attempt(max_tries) | tenacity.stop_after_delay(max_time)),
594
+ )
595
+ def _poll_operation(operation: dict, operations_client: "GoogleAPIResource") -> dict:
596
+ """
597
+ Helper function to poll the operation until it's complete.
598
+ Uses exponential backoff retry logic.
599
+
600
+ Each `operations.get` call uses the Google API requests limit. Details:
601
+ https://developers.google.com/workspace/drive/api/guides/limits
602
+
603
+ The limits as of May 2025 are:
604
+ - 12.000 calls per 60 seconds
605
+
606
+ In case of request limiting, the API will return 403 `User rate limit exceeded` error
607
+ or 429 `Too many requests` error.
608
+ """
609
+ if operation.get("done", False):
610
+ return operation
611
+ if "error" in operation:
612
+ raise SourceConnectionError(
613
+ f"Export operation failed: {operation['error']['message']}"
614
+ )
615
+ # Refresh the operation status:
616
+ # FYI: In some cases the `operations.get` call errors with 403 "User does not have
617
+ # permission" error even if the same user created the operation with the `download` method.
618
+ updated_operation = operations_client.get(name=operation["name"]).execute()
619
+ if not updated_operation.get("done", False):
620
+ raise OperationNotFinished()
621
+ return updated_operation
622
+
623
+ try:
624
+ with self._get_files_and_operations_client() as (files_client, operations_client):
625
+ # Start the LRO
626
+ operation = files_client.download(fileId=file_id, mimeType=mime_type).execute()
627
+
628
+ # In case the operation is not finished, poll it until it's complete
629
+ updated_operation = _poll_operation(operation, operations_client)
630
+
631
+ # Get the download URI from the completed operation
632
+ download_uri = updated_operation["response"]["downloadUri"]
633
+
634
+ # Download the file using the URI
635
+ self._raw_download_google_drive_file(download_uri, download_path)
636
+
637
+ except HttpError as error:
638
+ raise SourceConnectionError(
639
+ f"Failed to export file using Google Drive LRO: {error}"
640
+ ) from error
641
+
642
+ @requires_dependencies(["googleapiclient"], extras="google-drive")
643
+ def _export_gdrive_native_file(
644
+ self, file_id: str, download_path: Path, mime_type: str, file_size: int
645
+ ):
646
+ """Exports a Google Drive native file (Docs, Sheets, Slides) to a specified format.
647
+
648
+ This method uses the Google Drive API's export functionality to convert Google Workspace
649
+ files to other formats (e.g., Google Docs to PDF, Google Sheets to Excel).
650
+ For files larger than 10MB, it falls back to using Long-Running Operation (LRO).
651
+
652
+ Args:
653
+ file_id (str): The ID of the Google Drive file to export.
654
+ download_path (Path): The local path where the exported file should be saved.
655
+ mime_type (str): The target MIME type for the exported file (e.g., 'application/pdf').
656
+ file_size (int): The size of the file to export - used to determine if the
657
+ file is large enough to use LRO instead of direct export endpoint.
511
658
  Returns:
512
- Tuple[str, str]: (download URL, file extension or "")
659
+ None: The exported file content is written to `download_path`.
513
660
 
514
661
  Raises:
515
- SourceConnectionError: If no valid export or download link is available.
662
+ SourceConnectionError: If the export operation fails.
516
663
  """
664
+ from googleapiclient.errors import HttpError
665
+ from googleapiclient.http import MediaIoBaseDownload
666
+
667
+ if file_size > LRO_EXPORT_SIZE_THRESHOLD:
668
+ self._export_gdrive_file_with_lro(file_id, download_path, mime_type)
669
+ return
670
+
517
671
  with self.connection_config.get_client() as client:
518
- metadata = client.get(fileId=file_id, fields="exportLinks,webContentLink").execute()
672
+ try:
673
+ # pylint: disable=maybe-no-member
674
+ request = client.export_media(fileId=file_id, mimeType=mime_type)
675
+ with open(download_path, "wb") as file:
676
+ downloader = MediaIoBaseDownload(file, request)
677
+ done = False
678
+ while done is False:
679
+ status, done = downloader.next_chunk()
680
+ logger.debug(f"Download progress: {int(status.progress() * 100)}.")
681
+ except HttpError as error:
682
+ if error.resp.status == 403 and "too large" in error.reason.lower():
683
+ # Even though we have the LRO threshold, for some smaller files the
684
+ # export size might exceed 10MB and we get a 403 error.
685
+ # In that case, we use LRO as a fallback.
686
+ self._export_gdrive_file_with_lro(file_id, download_path, mime_type)
687
+ else:
688
+ raise SourceConnectionError(f"Failed to export file: {error}") from error
519
689
 
520
- export_links = metadata.get("exportLinks", {})
521
- web_link = metadata.get("webContentLink")
690
+ @requires_dependencies(["googleapiclient"], extras="google-drive")
691
+ @contextmanager
692
+ def _get_files_and_operations_client(
693
+ self,
694
+ ) -> Generator[tuple["GoogleAPIResource", "GoogleAPIResource"], None, None]:
695
+ """
696
+ Returns a context manager for the files and operations clients for the Google Drive API.
522
697
 
523
- if export_mime := GOOGLE_EXPORT_MIME_MAP.get(mime_type):
524
- url = export_links.get(export_mime)
525
- if not url:
526
- raise SourceConnectionError(f"No export link found for {file_id} as {export_mime}")
527
- ext = EXPORT_EXTENSION_MAP.get(export_mime, "")
528
- return url, ext
698
+ Yields:
699
+ Tuple[GoogleAPIResource, GoogleAPIResource]: A tuple of the files
700
+ and operations clients.
701
+ """
702
+ from googleapiclient.discovery import build
529
703
 
530
- if not web_link:
531
- raise SourceConnectionError(f"No webContentLink available for file {file_id}")
532
- return web_link, ""
704
+ creds = self._get_credentials()
705
+ service = build("drive", "v3", credentials=creds)
706
+ with (
707
+ service.operations() as operations_client,
708
+ service.files() as files_client,
709
+ ):
710
+ yield files_client, operations_client
533
711
 
534
- @requires_dependencies(["httpx", "google.auth"], extras="google-drive")
535
- def _download_url(self, file_data: FileData, url: str, ext: str = "") -> Path:
712
+ @requires_dependencies(["httpx"])
713
+ def _raw_download_google_drive_file(self, url: str, download_path: Path) -> Path:
536
714
  """
537
715
  Streams file content directly to disk using authenticated HTTP request.
716
+ Must use httpx to stream the file to disk as currently there's no google SDK
717
+ functionality to download a file like for get media or export operations.
538
718
 
539
719
  Writes the file to the correct path in the download directory while downloading.
540
720
  Avoids buffering large files in memory.
541
721
 
542
- Returns:
543
- Path to the downloaded file.
722
+ Args:
723
+ url (str): The URL of the file to download.
724
+ download_path (Path): The path to save the downloaded file.
544
725
 
545
- Raises:
546
- SourceConnectionError: If the HTTP request fails.
726
+ Returns:
727
+ Path: The path to the downloaded file.
547
728
  """
548
729
  import httpx
549
730
  from google.auth.transport.requests import Request
550
- from google.oauth2 import service_account
551
731
 
552
- access_config = self.connection_config.access_config.get_secret_value()
553
- key_data = access_config.get_service_account_key()
554
- creds = service_account.Credentials.from_service_account_info(
555
- key_data,
556
- scopes=["https://www.googleapis.com/auth/drive.readonly"],
557
- )
732
+ creds = self._get_credentials()
733
+
558
734
  creds.refresh(Request())
559
735
 
560
736
  headers = {
561
737
  "Authorization": f"Bearer {creds.token}",
562
738
  }
563
739
 
564
- download_path = self.get_download_path(file_data)
565
- if ext:
566
- download_path = download_path.with_suffix(ext)
567
-
568
- download_path.parent.mkdir(parents=True, exist_ok=True)
569
- logger.debug(f"Streaming file to {download_path}")
570
-
571
740
  with (
572
741
  httpx.Client(timeout=None, follow_redirects=True) as client,
573
742
  client.stream("GET", url, headers=headers) as response,
@@ -579,26 +748,91 @@ class GoogleDriveDownloader(Downloader):
579
748
  with open(download_path, "wb") as f:
580
749
  for chunk in response.iter_bytes():
581
750
  f.write(chunk)
751
+ return download_path
752
+
753
+ @requires_dependencies(["google"], extras="google-drive")
754
+ def _get_credentials(self):
755
+ """
756
+ Retrieves the credentials for Google Drive API access.
757
+
758
+ Returns:
759
+ Credentials: The credentials for Google Drive API access.
760
+ """
761
+ from google.oauth2 import service_account
762
+
763
+ access_config = self.connection_config.access_config.get_secret_value()
764
+ key_data = access_config.get_service_account_key()
765
+ creds = service_account.Credentials.from_service_account_info(
766
+ key_data,
767
+ scopes=["https://www.googleapis.com/auth/drive.readonly"],
768
+ )
769
+ return creds
770
+
771
+ def _download_file(self, file_data: FileData) -> Path:
772
+ """Downloads a file from Google Drive using either direct download or export based
773
+ on the source file's MIME type.
774
+
775
+ This method determines the appropriate download method based on the file's MIME type:
776
+ - For Google Workspace files (Docs, Sheets, Slides), uses export functionality
777
+ - For other files, uses direct download
778
+
779
+ Args:
780
+ file_data (FileData): The metadata of the file being downloaded.
781
+
782
+ Returns:
783
+ Path: The path to the downloaded file.
784
+
785
+ Raises:
786
+ SourceConnectionError: If the download fails.
787
+ """
788
+ mime_type = file_data.additional_metadata.get("mimeType", "")
789
+ file_size = int(file_data.additional_metadata.get("size", 0))
790
+ file_id = file_data.identifier
791
+
792
+ download_path = self.get_download_path(file_data)
793
+ if not download_path:
794
+ raise SourceConnectionError(f"Failed to get download path for file {file_id}")
795
+
796
+ if mime_type in GOOGLE_EXPORT_MIME_MAP:
797
+ # For Google Workspace files, use export functionality
798
+ ext = _get_extension(file_data)
799
+ download_path = download_path.with_suffix(ext)
800
+ download_path.parent.mkdir(parents=True, exist_ok=True)
801
+ export_mime = GOOGLE_EXPORT_MIME_MAP[mime_type]
802
+ self._export_gdrive_native_file(
803
+ file_id=file_id,
804
+ download_path=download_path,
805
+ mime_type=export_mime,
806
+ file_size=file_size,
807
+ )
808
+ file_data.additional_metadata.update(
809
+ {
810
+ "export_mime_type": export_mime,
811
+ "export_extension": ext,
812
+ "download_method": "google_workspace_export",
813
+ }
814
+ )
815
+ else:
816
+ # For other files, use direct download
817
+ download_path.parent.mkdir(parents=True, exist_ok=True)
818
+ self._direct_download_file(file_id=file_id, download_path=download_path)
819
+ file_data.additional_metadata.update(
820
+ {
821
+ "download_method": "direct_download",
822
+ }
823
+ )
582
824
 
583
825
  return download_path
584
826
 
585
827
  def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
586
828
  mime_type = file_data.additional_metadata.get("mimeType", "")
587
- record_id = file_data.identifier
588
829
 
589
830
  logger.debug(
590
831
  f"Downloading file {file_data.source_identifiers.fullpath} of type {mime_type}"
591
832
  )
592
833
 
593
- download_url, ext = self._get_download_url_and_ext(record_id, mime_type)
594
- download_path = self._download_url(file_data, download_url, ext)
834
+ download_path = self._download_file(file_data)
595
835
 
596
- file_data.additional_metadata.update(
597
- {
598
- "download_method": "export_link" if ext else "web_content_link",
599
- "download_url_used": download_url,
600
- }
601
- )
602
836
  file_data.local_download_path = str(download_path.resolve())
603
837
 
604
838
  return self.generate_download_response(file_data=file_data, download_path=download_path)
@@ -370,14 +370,14 @@ class OnedriveUploader(Uploader):
370
370
  # Use the remote_url from upload_config as the base destination folder
371
371
  base_destination_folder = self.upload_config.url
372
372
 
373
- # Use the file's relative path to maintain directory structure, if needed
374
- if file_data.source_identifiers and file_data.source_identifiers.rel_path:
375
- # Combine the base destination folder with the file's relative path
373
+ # Use the file's full path to maintain directory structure, if needed
374
+ if file_data.source_identifiers and file_data.source_identifiers.fullpath:
375
+ # Combine the base destination folder with the file's full path
376
376
  destination_path = Path(base_destination_folder) / Path(
377
- f"{file_data.source_identifiers.rel_path}.json"
377
+ f"{file_data.source_identifiers.fullpath}.json"
378
378
  )
379
379
  else:
380
- # If no relative path is provided, upload directly to the base destination folder
380
+ # If no full path is provided, upload directly to the base destination folder
381
381
  destination_path = Path(base_destination_folder) / f"{path.name}.json"
382
382
 
383
383
  destination_folder = destination_path.parent
@@ -32,7 +32,9 @@ class RedisAccessConfig(AccessConfig):
32
32
  default=None, description="If not anonymous, use this uri, if specified."
33
33
  )
34
34
  password: Optional[str] = Field(
35
- default=None, description="If not anonymous, use this password, if specified."
35
+ default=None,
36
+ description="Password used to connect to database if uri is "
37
+ "not specified and connection is not anonymous.",
36
38
  )
37
39
 
38
40
 
@@ -41,20 +43,32 @@ class RedisConnectionConfig(ConnectionConfig):
41
43
  default=RedisAccessConfig(), validate_default=True
42
44
  )
43
45
  host: Optional[str] = Field(
44
- default=None, description="Hostname or IP address of a Redis instance to connect to."
46
+ default=None,
47
+ description="Hostname or IP address of a Redis instance to connect to "
48
+ "if uri is not specified.",
45
49
  )
46
50
  database: int = Field(default=0, description="Database index to connect to.")
47
- port: int = Field(default=6379, description="port used to connect to database.")
51
+ port: Optional[int] = Field(
52
+ default=6379, description="Port used to connect to database if uri is not specified."
53
+ )
48
54
  username: Optional[str] = Field(
49
- default=None, description="Username used to connect to database."
55
+ default=None, description="Username used to connect to database if uri is not specified."
56
+ )
57
+ ssl: Optional[bool] = Field(
58
+ default=True,
59
+ description="Whether the connection should use SSL encryption if uri is not specified.",
50
60
  )
51
- ssl: bool = Field(default=True, description="Whether the connection should use SSL encryption.")
52
61
  connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
53
62
 
54
63
  @model_validator(mode="after")
55
64
  def validate_host_or_url(self) -> "RedisConnectionConfig":
56
- if not self.access_config.get_secret_value().uri and not self.host:
57
- raise ValueError("Please pass a hostname either directly or through uri")
65
+ if not self.access_config.get_secret_value().uri:
66
+ if not self.host:
67
+ raise ValueError("Please pass a hostname either directly or through uri")
68
+ if self.port is None:
69
+ raise ValueError("Since URI is not specified, port cannot be None")
70
+ if self.ssl is None:
71
+ raise ValueError("Since URI is not specified, ssl cannot be None")
58
72
  return self
59
73
 
60
74
  @requires_dependencies(["redis"], extras="redis")
@@ -64,21 +78,20 @@ class RedisConnectionConfig(ConnectionConfig):
64
78
 
65
79
  access_config = self.access_config.get_secret_value()
66
80
 
67
- options = {
68
- "host": self.host,
69
- "port": self.port,
70
- "db": self.database,
71
- "ssl": self.ssl,
72
- "username": self.username,
73
- }
74
-
75
- if access_config.password:
76
- options["password"] = access_config.password
77
-
78
81
  if access_config.uri:
79
82
  async with from_url(access_config.uri) as client:
80
83
  yield client
81
84
  else:
85
+ options = {
86
+ "host": self.host,
87
+ "port": self.port,
88
+ "db": self.database,
89
+ "ssl": self.ssl,
90
+ "username": self.username,
91
+ }
92
+
93
+ if access_config.password:
94
+ options["password"] = access_config.password
82
95
  async with Redis(**options) as client:
83
96
  yield client
84
97
 
@@ -113,6 +126,20 @@ class RedisUploaderConfig(UploaderConfig):
113
126
  key_prefix: str = Field(default="", description="Prefix for Redis keys")
114
127
 
115
128
 
129
+ def _form_redis_pipeline_error_message(error: str) -> str:
130
+ """
131
+ Form a user-friendly error message for Redis pipeline errors.
132
+ The error message has `$` character at the beginning and `) of pipeline` at the end.
133
+ Everything between these two strings is the value and should be removed.
134
+ """
135
+ start = error.find("$")
136
+ end = error.find(") of pipeline")
137
+ if start != -1 and end != -1:
138
+ return error[: start + 1] + "<value>" + error[end:]
139
+ else:
140
+ return error
141
+
142
+
116
143
  @dataclass
117
144
  class RedisUploader(Uploader):
118
145
  upload_config: RedisUploaderConfig
@@ -169,14 +196,14 @@ class RedisUploader(Uploader):
169
196
  # Redis with stack extension supports JSON type
170
197
  await pipe.json().set(key_with_prefix, "$", element).execute()
171
198
  except redis_exceptions.ResponseError as e:
172
- message = str(e)
199
+ message = _form_redis_pipeline_error_message(str(e))
173
200
  if "unknown command `JSON.SET`" in message:
174
201
  # if this error occurs, Redis server doesn't support JSON type,
175
202
  # so save as string type instead
176
203
  await pipe.set(key_with_prefix, json.dumps(element)).execute()
177
204
  redis_stack = False
178
205
  else:
179
- raise e
206
+ raise redis_exceptions.ResponseError(message) from e
180
207
  return redis_stack
181
208
 
182
209