unstructured-ingest 1.0.8__tar.gz → 1.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (239)
  1. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/PKG-INFO +1 -1
  2. unstructured_ingest-1.0.12/unstructured_ingest/__version__.py +1 -0
  3. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/google_drive.py +171 -65
  4. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +36 -6
  5. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/pinecone.py +26 -0
  6. unstructured_ingest-1.0.8/unstructured_ingest/__version__.py +0 -1
  7. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/.gitignore +0 -0
  8. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/LICENSE.md +0 -0
  9. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/README.md +0 -0
  10. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/pyproject.toml +0 -0
  11. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/__init__.py +0 -0
  12. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/README.md +0 -0
  13. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/__init__.py +0 -0
  14. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/__init__.py +0 -0
  15. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/cmd.py +0 -0
  16. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/dest.py +0 -0
  17. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/importer.py +0 -0
  18. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/base/src.py +0 -0
  19. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/cli.py +0 -0
  20. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/cmds.py +0 -0
  21. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/utils/__init__.py +0 -0
  22. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/utils/click.py +0 -0
  23. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
  24. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/data_types/__init__.py +0 -0
  25. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/data_types/entities.py +0 -0
  26. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/data_types/file_data.py +0 -0
  27. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/__init__.py +0 -0
  28. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/azure_openai.py +0 -0
  29. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/bedrock.py +0 -0
  30. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/huggingface.py +0 -0
  31. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/interfaces.py +0 -0
  32. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/mixedbreadai.py +0 -0
  33. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/octoai.py +0 -0
  34. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/openai.py +0 -0
  35. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/togetherai.py +0 -0
  36. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/vertexai.py +0 -0
  37. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/embed/voyageai.py +0 -0
  38. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/error.py +0 -0
  39. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/errors_v2.py +0 -0
  40. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/__init__.py +0 -0
  41. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/connector.py +0 -0
  42. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/downloader.py +0 -0
  43. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/indexer.py +0 -0
  44. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/process.py +0 -0
  45. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/processor.py +0 -0
  46. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/upload_stager.py +0 -0
  47. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/interfaces/uploader.py +0 -0
  48. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/logger.py +0 -0
  49. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/main.py +0 -0
  50. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/otel.py +0 -0
  51. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/__init__.py +0 -0
  52. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/interfaces.py +0 -0
  53. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/otel.py +0 -0
  54. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/pipeline.py +0 -0
  55. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
  56. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
  57. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/download.py +0 -0
  58. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/embed.py +0 -0
  59. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/filter.py +0 -0
  60. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/index.py +0 -0
  61. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/partition.py +0 -0
  62. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/stage.py +0 -0
  63. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
  64. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/pipeline/steps/upload.py +0 -0
  65. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/__init__.py +0 -0
  66. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/chunker.py +0 -0
  67. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connector_registry.py +0 -0
  68. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/__init__.py +0 -0
  69. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/airtable.py +0 -0
  70. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  71. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
  72. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
  73. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/astradb.py +0 -0
  74. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
  75. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/chroma.py +0 -0
  76. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/confluence.py +0 -0
  77. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
  78. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
  79. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
  80. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
  81. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
  82. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
  83. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
  84. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
  85. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
  86. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/discord.py +0 -0
  87. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
  88. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
  89. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
  90. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
  91. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
  92. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
  93. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
  94. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
  95. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
  96. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
  97. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
  98. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
  99. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
  100. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
  101. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
  102. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
  103. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/github.py +0 -0
  104. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
  105. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
  106. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/jira.py +0 -0
  107. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
  108. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
  109. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
  110. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
  111. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
  112. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
  113. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
  114. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
  115. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
  116. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
  117. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
  118. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
  119. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/local.py +0 -0
  120. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/milvus.py +0 -0
  121. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
  122. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
  123. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  124. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
  125. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
  126. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
  127. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
  128. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
  129. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
  130. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
  131. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
  132. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  133. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
  134. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
  135. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
  136. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
  137. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
  138. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
  139. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
  140. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
  141. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
  142. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
  143. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
  144. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
  145. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
  146. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
  147. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
  148. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
  149. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
  150. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
  151. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
  152. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
  153. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
  154. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
  155. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
  156. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
  157. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
  158. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
  159. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
  160. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
  161. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
  162. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
  163. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
  164. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
  165. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
  166. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
  167. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
  168. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
  169. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
  170. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
  171. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
  172. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
  173. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
  174. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
  175. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
  176. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
  177. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
  178. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
  179. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
  180. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
  181. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
  182. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
  183. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
  184. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
  185. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
  186. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
  187. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
  188. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
  189. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
  190. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
  191. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
  192. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
  193. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
  194. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/outlook.py +0 -0
  195. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
  196. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
  197. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
  198. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
  199. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
  200. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
  201. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
  202. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
  203. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/slack.py +0 -0
  204. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
  205. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
  206. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
  207. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
  208. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
  209. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
  210. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
  211. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
  212. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/utils.py +0 -0
  213. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/vectara.py +0 -0
  214. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
  215. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
  216. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
  217. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
  218. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
  219. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  220. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
  221. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
  222. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/embedder.py +0 -0
  223. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/filter.py +0 -0
  224. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/partitioner.py +0 -0
  225. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/uncompress.py +0 -0
  226. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/utils/__init__.py +0 -0
  227. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
  228. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/unstructured_api.py +0 -0
  229. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/__init__.py +0 -0
  230. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/chunking.py +0 -0
  231. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/compression.py +0 -0
  232. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/constants.py +0 -0
  233. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/data_prep.py +0 -0
  234. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/dep_check.py +0 -0
  235. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/html.py +0 -0
  236. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/ndjson.py +0 -0
  237. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/pydantic_models.py +0 -0
  238. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  239. {unstructured_ingest-1.0.8 → unstructured_ingest-1.0.12}/unstructured_ingest/utils/table.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.8
3
+ Version: 1.0.12
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -0,0 +1 @@
1
+ __version__ = "1.0.12" # pragma: no cover
@@ -1,4 +1,3 @@
1
- import io
2
1
  import json
3
2
  from contextlib import contextmanager
4
3
  from dataclasses import dataclass, field
@@ -16,7 +15,6 @@ from unstructured_ingest.data_types.file_data import (
16
15
  )
17
16
  from unstructured_ingest.error import (
18
17
  SourceConnectionError,
19
- SourceConnectionNetworkError,
20
18
  )
21
19
  from unstructured_ingest.interfaces import (
22
20
  AccessConfig,
@@ -34,25 +32,34 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
34
32
 
35
33
  if TYPE_CHECKING:
36
34
  from googleapiclient.discovery import Resource as GoogleAPIResource
37
- from googleapiclient.http import MediaIoBaseDownload
38
35
 
39
36
  CONNECTOR_TYPE = "google_drive"
40
37
 
41
- GOOGLE_DRIVE_EXPORT_TYPES = {
42
- "application/vnd.google-apps.document": "application/"
43
- "vnd.openxmlformats-officedocument.wordprocessingml.document",
44
- "application/vnd.google-apps.spreadsheet": "application/"
45
- "vnd.openxmlformats-officedocument.spreadsheetml.sheet",
46
- "application/vnd.google-apps.presentation": "application/"
47
- "vnd.openxmlformats-officedocument.presentationml.presentation",
48
- "application/vnd.google-apps.photo": "image/jpeg",
38
+
39
+ # Maps Google-native Drive MIME types → export MIME types
40
+ GOOGLE_EXPORT_MIME_MAP = {
41
+ "application/vnd.google-apps.document": \
42
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
43
+ "application/vnd.google-apps.spreadsheet": \
44
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
45
+ "application/vnd.google-apps.presentation": \
46
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
47
+ }
48
+
49
+ # Maps export MIME types → file extensions
50
+ EXPORT_EXTENSION_MAP = {
51
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
52
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
53
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
54
+ "application/pdf": ".pdf",
55
+ "text/html": ".html",
49
56
  }
50
57
 
51
58
 
52
59
  class GoogleDriveAccessConfig(AccessConfig):
53
- service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
54
- default=None, description="Credentials values to use for authentication"
55
- )
60
+ service_account_key: Optional[
61
+ Annotated[dict, BeforeValidator(conform_string_to_dict)]
62
+ ] = Field(default=None, description="Credentials values to use for authentication")
56
63
  service_account_key_path: Optional[Path] = Field(
57
64
  default=None,
58
65
  description="File path to credentials values to use for authentication",
@@ -153,7 +160,13 @@ class GoogleDriveIndexer(Indexer):
153
160
  """
154
161
  try:
155
162
  # A very minimal call: list 1 file from the drive.
156
- client.list(spaces="drive", pageSize=1, fields="files(id)").execute()
163
+ client.list(
164
+ supportsAllDrives=True,
165
+ includeItemsFromAllDrives=True,
166
+ spaces="drive",
167
+ pageSize=1,
168
+ fields="files(id)",
169
+ ).execute()
157
170
  except HttpError as e:
158
171
  error_content = e.content.decode() if hasattr(e, "content") else ""
159
172
  lower_error = error_content.lower()
@@ -165,10 +178,14 @@ class GoogleDriveIndexer(Indexer):
165
178
  Please enable it in the Google Cloud Console."
166
179
  )
167
180
  else:
168
- raise SourceConnectionError("Google drive API unreachable for an unknown reason!")
181
+ raise SourceConnectionError(
182
+ "Google drive API unreachable for an unknown reason!"
183
+ )
169
184
 
170
185
  @staticmethod
171
- def count_files_recursively(files_client, folder_id: str, extensions: list[str] = None) -> int:
186
+ def count_files_recursively(
187
+ files_client, folder_id: str, extensions: list[str] = None
188
+ ) -> int:
172
189
  """
173
190
  Count non-folder files recursively under the given folder.
174
191
  If `extensions` is provided, only count files
@@ -183,6 +200,8 @@ class GoogleDriveIndexer(Indexer):
183
200
  page_token = None
184
201
  while True:
185
202
  response = files_client.list(
203
+ supportsAllDrives=True,
204
+ includeItemsFromAllDrives=True,
186
205
  spaces="drive",
187
206
  q=query,
188
207
  fields="nextPageToken, files(id, mimeType, fileExtension)",
@@ -247,10 +266,14 @@ class GoogleDriveIndexer(Indexer):
247
266
  # that the service account has proper permissions."
248
267
  # )
249
268
  else:
250
- logger.info(f"Found {file_count} files recursively in the folder.")
269
+ logger.info(
270
+ f"Found {file_count} files recursively in the folder."
271
+ )
251
272
  else:
252
273
  # Non-recursive: check for at least one immediate non-folder child.
253
274
  response = client.list(
275
+ supportsAllDrives=True,
276
+ includeItemsFromAllDrives=True,
254
277
  spaces="drive",
255
278
  fields="files(id)",
256
279
  pageSize=1,
@@ -296,7 +319,9 @@ class GoogleDriveIndexer(Indexer):
296
319
  date_modified_str = root_info.pop("modifiedTime", None)
297
320
  parent_path = root_info.pop("parent_path", None)
298
321
  parent_root_path = root_info.pop("parent_root_path", None)
299
- date_modified_dt = parser.parse(date_modified_str) if date_modified_str else None
322
+ date_modified_dt = (
323
+ parser.parse(date_modified_str) if date_modified_str else None
324
+ )
300
325
  if (
301
326
  parent_path
302
327
  and isinstance(parent_path, str)
@@ -348,6 +373,8 @@ class GoogleDriveIndexer(Indexer):
348
373
  files_response = []
349
374
  while not done:
350
375
  response: dict = files_client.list(
376
+ supportsAllDrives=True,
377
+ includeItemsFromAllDrives=True,
351
378
  spaces="drive",
352
379
  fields=fields_input,
353
380
  corpora="user",
@@ -381,7 +408,9 @@ class GoogleDriveIndexer(Indexer):
381
408
  return files_response
382
409
 
383
410
  def get_root_info(self, files_client, object_id: str) -> dict:
384
- return files_client.get(fileId=object_id, fields=",".join(self.fields)).execute()
411
+ return files_client.get(
412
+ supportsAllDrives=True, fileId=object_id, fields=",".join(self.fields)
413
+ ).execute()
385
414
 
386
415
  def get_files(
387
416
  self,
@@ -392,7 +421,9 @@ class GoogleDriveIndexer(Indexer):
392
421
  ) -> list[FileData]:
393
422
  root_info = self.get_root_info(files_client=files_client, object_id=object_id)
394
423
  if not self.is_dir(root_info):
395
- root_info["permissions"] = self.extract_permissions(root_info.get("permissions"))
424
+ root_info["permissions"] = self.extract_permissions(
425
+ root_info.get("permissions")
426
+ )
396
427
  data = [self.map_file_data(root_info)]
397
428
  else:
398
429
  file_contents = self.get_paginated_results(
@@ -413,7 +444,7 @@ class GoogleDriveIndexer(Indexer):
413
444
  def extract_permissions(self, permissions: Optional[list[dict]]) -> list[dict]:
414
445
  if not permissions:
415
446
  logger.debug("no permissions found")
416
- return {}
447
+ return [{}]
417
448
 
418
449
  # https://developers.google.com/workspace/drive/api/guides/ref-roles
419
450
  role_mapping = {
@@ -464,61 +495,136 @@ class GoogleDriveDownloaderConfig(DownloaderConfig):
464
495
 
465
496
  @dataclass
466
497
  class GoogleDriveDownloader(Downloader):
498
+ """
499
+ Downloads files from Google Drive using authenticated direct HTTP requests
500
+ via `exportLinks` (for Google-native files) and `webContentLink` (for binary files).
501
+
502
+ These links emulate the behavior of Google Drive's "File > Download as..." options
503
+ in the UI and bypass the size limitations of `files.export()`.
504
+
505
+ Behavior:
506
+ - Google-native formats are downloaded using `exportLinks` in appropriate MIME formats.
507
+ - Binary files (non-Google-native) are downloaded using `webContentLink`.
508
+ - All downloads are performed via `requests.get()` using a valid bearer token.
509
+ """
510
+
467
511
  connection_config: GoogleDriveConnectionConfig
468
512
  download_config: GoogleDriveDownloaderConfig = field(
469
513
  default_factory=lambda: GoogleDriveDownloaderConfig()
470
514
  )
471
515
  connector_type: str = CONNECTOR_TYPE
472
516
 
473
- @SourceConnectionNetworkError.wrap
474
- def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
475
- downloaded = False
476
- while downloaded is False:
477
- _, downloaded = downloader.next_chunk()
478
- return downloaded
517
+ def _get_download_url_and_ext(
518
+ self, file_id: str, mime_type: str
519
+ ) -> tuple[str, str]:
520
+ """
521
+ Resolves the appropriate download URL and expected file extension for a Google Drive file.
479
522
 
480
- def _write_file(self, file_data: FileData, file_contents: io.BytesIO) -> DownloadResponse:
481
- download_path = self.get_download_path(file_data=file_data)
482
- download_path.parent.mkdir(parents=True, exist_ok=True)
483
- logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
484
- with open(download_path, "wb") as handler:
485
- handler.write(file_contents.getbuffer())
486
- return self.generate_download_response(file_data=file_data, download_path=download_path)
523
+ - Google-native files use export MIME types from exportLinks (e.g., .docx, .xlsx).
524
+ - Binary files use webContentLink (e.g., uploaded PDFs or ZIPs).
487
525
 
488
- @requires_dependencies(["googleapiclient"], extras="google-drive")
489
- def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
490
- from googleapiclient.http import MediaIoBaseDownload
526
+ Returns:
527
+ Tuple[str, str]: (download URL, file extension or "")
491
528
 
492
- logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
493
- record_id = file_data.identifier
494
- mime_type = file_data.additional_metadata["mimeType"]
495
- if not mime_type:
496
- raise TypeError(
497
- f"File not supported. Name: {file_data.source_identifiers.filename} "
498
- f"ID: {record_id} "
499
- f"MimeType: {mime_type}"
500
- )
529
+ Raises:
530
+ SourceConnectionError: If no valid export or download link is available.
531
+ """
501
532
  with self.connection_config.get_client() as client:
502
- if (
503
- mime_type.startswith("application/vnd.google-apps")
504
- and mime_type in GOOGLE_DRIVE_EXPORT_TYPES
505
- ):
506
- export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
507
- mime_type, # type: ignore
533
+ metadata = client.get(
534
+ fileId=file_id, fields="exportLinks,webContentLink"
535
+ ).execute()
536
+
537
+ export_links = metadata.get("exportLinks", {})
538
+ web_link = metadata.get("webContentLink")
539
+
540
+ if export_mime := GOOGLE_EXPORT_MIME_MAP.get(mime_type):
541
+ url = export_links.get(export_mime)
542
+ if not url:
543
+ raise SourceConnectionError(
544
+ f"No export link found for {file_id} as {export_mime}"
508
545
  )
509
- request = client.export_media(
510
- fileId=record_id,
511
- mimeType=export_mime,
546
+ ext = EXPORT_EXTENSION_MAP.get(export_mime, "")
547
+ return url, ext
548
+
549
+ if not web_link:
550
+ raise SourceConnectionError(
551
+ f"No webContentLink available for file {file_id}"
552
+ )
553
+ return web_link, ""
554
+
555
+ @requires_dependencies(["httpx", "google.auth"], extras="google-drive")
556
+ def _download_url(self, file_data: FileData, url: str, ext: str = "") -> Path:
557
+ """
558
+ Streams file content directly to disk using authenticated HTTP request.
559
+
560
+ Writes the file to the correct path in the download directory while downloading.
561
+ Avoids buffering large files in memory.
562
+
563
+ Returns:
564
+ Path to the downloaded file.
565
+
566
+ Raises:
567
+ SourceConnectionError: If the HTTP request fails.
568
+ """
569
+ import httpx
570
+ from google.auth.transport.requests import Request
571
+ from google.oauth2 import service_account
572
+
573
+ access_config = self.connection_config.access_config.get_secret_value()
574
+ key_data = access_config.get_service_account_key()
575
+ creds = service_account.Credentials.from_service_account_info(
576
+ key_data,
577
+ scopes=["https://www.googleapis.com/auth/drive.readonly"],
578
+ )
579
+ creds.refresh(Request())
580
+
581
+ headers = {
582
+ "Authorization": f"Bearer {creds.token}",
583
+ }
584
+
585
+ download_path = self.get_download_path(file_data)
586
+ if ext:
587
+ download_path = download_path.with_suffix(ext)
588
+
589
+ download_path.parent.mkdir(parents=True, exist_ok=True)
590
+ logger.debug(f"Streaming file to {download_path}")
591
+
592
+ with (
593
+ httpx.Client(timeout=None, follow_redirects=True) as client,
594
+ client.stream("GET", url, headers=headers) as response,
595
+ ):
596
+ if response.status_code != 200:
597
+ raise SourceConnectionError(
598
+ f"Failed to stream download from {url}: {response.status_code}"
512
599
  )
513
- else:
514
- request = client.get_media(fileId=record_id)
515
-
516
- file_contents = io.BytesIO()
517
- downloader = MediaIoBaseDownload(file_contents, request)
518
- downloaded = self._get_content(downloader=downloader)
519
- if not downloaded or not file_contents:
520
- raise SourceConnectionError("nothing found to download")
521
- return self._write_file(file_data=file_data, file_contents=file_contents)
600
+ with open(download_path, "wb") as f:
601
+ for chunk in response.iter_bytes():
602
+ f.write(chunk)
603
+
604
+ return download_path
605
+
606
+ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
607
+ mime_type = file_data.additional_metadata.get("mimeType", "")
608
+ record_id = file_data.identifier
609
+
610
+ logger.debug(
611
+ f"Downloading file {file_data.source_identifiers.fullpath} of type {mime_type}"
612
+ )
613
+
614
+ download_url, ext = self._get_download_url_and_ext(record_id, mime_type)
615
+ download_path = self._download_url(file_data, download_url, ext)
616
+
617
+ file_data.additional_metadata.update(
618
+ {
619
+ "download_method": "export_link" if ext else "web_content_link",
620
+ "download_url_used": download_url,
621
+ }
622
+ )
623
+ file_data.local_download_path = str(download_path.resolve())
624
+
625
+ return self.generate_download_response(
626
+ file_data=file_data, download_path=download_path
627
+ )
522
628
 
523
629
 
524
630
  google_drive_source_entry = SourceRegistryEntry(
@@ -69,8 +69,8 @@ class IbmWatsonxConnectionConfig(ConnectionConfig):
69
69
 
70
70
  @property
71
71
  def bearer_token(self) -> str:
72
- # Add 60 seconds to deal with edge cases where the token expires before the request is made
73
- timestamp = int(time.time()) + 60
72
+ # Add 5 minutes to deal with edge cases where the token expires before the request is made
73
+ timestamp = int(time.time()) + (60 * 5)
74
74
  if self._bearer_token is None or self._bearer_token.get("expiration", 0) <= timestamp:
75
75
  self._bearer_token = self.generate_bearer_token()
76
76
  return self._bearer_token["access_token"]
@@ -172,7 +172,7 @@ class IbmWatsonxUploaderConfig(UploaderConfig):
172
172
  namespace: str = Field(description="Namespace name")
173
173
  table: str = Field(description="Table name")
174
174
  max_retries: int = Field(
175
- default=5, description="Maximum number of retries to upload data", ge=2, le=10
175
+ default=5, description="Maximum number of retries to upload data", ge=2, le=500
176
176
  )
177
177
  record_id_key: str = Field(
178
178
  default=RECORD_ID_LABEL,
@@ -240,7 +240,7 @@ class IbmWatsonxUploader(SQLUploader):
240
240
  def upload_data_table(
241
241
  self, table: "Table", data_table: "ArrowTable", file_data: FileData
242
242
  ) -> None:
243
- from pyiceberg.exceptions import CommitFailedException
243
+ from pyiceberg.exceptions import CommitFailedException, RESTError
244
244
  from tenacity import (
245
245
  before_log,
246
246
  retry,
@@ -265,21 +265,51 @@ class IbmWatsonxUploader(SQLUploader):
265
265
  table.refresh()
266
266
  logger.debug(e)
267
267
  raise IcebergCommitFailedException(e)
268
+ except RESTError:
269
+ raise
268
270
  except Exception as e:
269
271
  raise ProviderError(f"Failed to upload data to table: {e}")
270
272
 
271
273
  try:
272
274
  return _upload_data_table(table, data_table, file_data)
275
+ except RESTError:
276
+ raise
273
277
  except ProviderError:
274
278
  raise
275
279
  except Exception as e:
276
280
  raise ProviderError(f"Failed to upload data to table: {e}")
277
281
 
282
+ @requires_dependencies(["pyiceberg", "tenacity"], extras="ibm-watsonx-s3")
278
283
  def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
284
+ from pyiceberg.exceptions import RESTError
285
+ from tenacity import (
286
+ before_log,
287
+ retry,
288
+ retry_if_exception_type,
289
+ stop_after_attempt,
290
+ wait_random,
291
+ )
292
+
279
293
  data_table = self._df_to_arrow_table(df)
280
294
 
281
- with self.get_table() as table:
282
- self.upload_data_table(table, data_table, file_data)
295
+ # Retry connection in case of connection error
296
+ @retry(
297
+ stop=stop_after_attempt(2),
298
+ wait=wait_random(),
299
+ retry=retry_if_exception_type(RESTError),
300
+ before=before_log(logger, logging.DEBUG),
301
+ reraise=True,
302
+ )
303
+ def _upload_dataframe(data_table: Any, file_data: FileData) -> None:
304
+ with self.get_table() as table:
305
+ self.upload_data_table(table, data_table, file_data)
306
+
307
+ try:
308
+ return _upload_dataframe(data_table, file_data)
309
+ except ProviderError:
310
+ raise
311
+ except Exception as e:
312
+ raise ProviderError(f"Failed to upload data to table: {e}")
283
313
 
284
314
  @requires_dependencies(["pandas"], extras="ibm-watsonx-s3")
285
315
  def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import re
3
3
  from dataclasses import dataclass, field
4
+ from pathlib import Path
4
5
  from typing import TYPE_CHECKING, Any, Literal, Optional
5
6
 
6
7
  from pydantic import Field, Secret
@@ -18,11 +19,14 @@ from unstructured_ingest.interfaces import (
18
19
  )
19
20
  from unstructured_ingest.logger import logger
20
21
  from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
22
+ from unstructured_ingest.utils import ndjson
21
23
  from unstructured_ingest.utils.constants import RECORD_ID_LABEL
22
24
  from unstructured_ingest.utils.data_prep import (
23
25
  flatten_dict,
24
26
  generator_batching_wbytes,
25
27
  get_enhanced_element_id,
28
+ get_json_data,
29
+ write_data,
26
30
  )
27
31
  from unstructured_ingest.utils.dep_check import requires_dependencies
28
32
 
@@ -162,6 +166,28 @@ class PineconeUploadStager(UploadStager):
162
166
  "metadata": metadata,
163
167
  }
164
168
 
169
+ def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
170
+ with input_file.open() as in_f:
171
+ reader = ndjson.reader(in_f)
172
+ with output_file.open("w") as out_f:
173
+ writer = ndjson.writer(out_f)
174
+ for element in reader:
175
+ if "embeddings" not in element:
176
+ continue
177
+ conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
178
+ writer.write(row=conformed_element)
179
+ writer.f.flush()
180
+
181
+ def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
182
+ elements_contents = get_json_data(path=input_file)
183
+
184
+ conformed_elements = [
185
+ self.conform_dict(element_dict=element, file_data=file_data)
186
+ for element in elements_contents
187
+ if "embeddings" in element
188
+ ]
189
+ write_data(path=output_file, data=conformed_elements)
190
+
165
191
 
166
192
  @dataclass
167
193
  class PineconeUploader(VectorDBUploader):
@@ -1 +0,0 @@
1
- __version__ = "1.0.8" # pragma: no cover