unstructured-ingest 1.0.2__tar.gz → 1.0.5__tar.gz

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (238)
  1. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/PKG-INFO +1 -1
  2. unstructured_ingest-1.0.5/unstructured_ingest/__version__.py +1 -0
  3. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/data_types/file_data.py +24 -2
  4. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/confluence.py +183 -16
  5. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/google_drive.py +42 -1
  6. unstructured_ingest-1.0.2/unstructured_ingest/__version__.py +0 -1
  7. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/.gitignore +0 -0
  8. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/LICENSE.md +0 -0
  9. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/README.md +0 -0
  10. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/pyproject.toml +0 -0
  11. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/__init__.py +0 -0
  12. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/README.md +0 -0
  13. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/__init__.py +0 -0
  14. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/__init__.py +0 -0
  15. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/cmd.py +0 -0
  16. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/dest.py +0 -0
  17. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/importer.py +0 -0
  18. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/base/src.py +0 -0
  19. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/cli.py +0 -0
  20. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/cmds.py +0 -0
  21. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/utils/__init__.py +0 -0
  22. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/utils/click.py +0 -0
  23. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/cli/utils/model_conversion.py +0 -0
  24. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/data_types/__init__.py +0 -0
  25. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/__init__.py +0 -0
  26. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/azure_openai.py +0 -0
  27. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/bedrock.py +0 -0
  28. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/huggingface.py +0 -0
  29. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/interfaces.py +0 -0
  30. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/mixedbreadai.py +0 -0
  31. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/octoai.py +0 -0
  32. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/openai.py +0 -0
  33. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/togetherai.py +0 -0
  34. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/vertexai.py +0 -0
  35. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/embed/voyageai.py +0 -0
  36. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/error.py +0 -0
  37. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/errors_v2.py +0 -0
  38. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/__init__.py +0 -0
  39. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/connector.py +0 -0
  40. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/downloader.py +0 -0
  41. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/indexer.py +0 -0
  42. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/process.py +0 -0
  43. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/processor.py +0 -0
  44. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/upload_stager.py +0 -0
  45. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/interfaces/uploader.py +0 -0
  46. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/logger.py +0 -0
  47. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/main.py +0 -0
  48. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/otel.py +0 -0
  49. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/__init__.py +0 -0
  50. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/interfaces.py +0 -0
  51. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/otel.py +0 -0
  52. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/pipeline.py +0 -0
  53. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/__init__.py +0 -0
  54. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/chunk.py +0 -0
  55. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/download.py +0 -0
  56. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/embed.py +0 -0
  57. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/filter.py +0 -0
  58. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/index.py +0 -0
  59. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/partition.py +0 -0
  60. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/stage.py +0 -0
  61. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/uncompress.py +0 -0
  62. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/pipeline/steps/upload.py +0 -0
  63. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/__init__.py +0 -0
  64. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/chunker.py +0 -0
  65. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connector_registry.py +0 -0
  66. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/__init__.py +0 -0
  67. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/airtable.py +0 -0
  68. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  69. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +0 -0
  70. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +0 -0
  71. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/astradb.py +0 -0
  72. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/azure_ai_search.py +0 -0
  73. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/chroma.py +0 -0
  74. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/couchbase.py +0 -0
  75. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/__init__.py +0 -0
  76. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes.py +0 -0
  77. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_aws.py +0 -0
  78. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_azure.py +0 -0
  79. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +0 -0
  80. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_native.py +0 -0
  81. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/databricks/volumes_table.py +0 -0
  82. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/delta_table.py +0 -0
  83. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/discord.py +0 -0
  84. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/duckdb/__init__.py +0 -0
  85. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/duckdb/base.py +0 -0
  86. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/duckdb/duckdb.py +0 -0
  87. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/duckdb/motherduck.py +0 -0
  88. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/elasticsearch/__init__.py +0 -0
  89. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +0 -0
  90. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +0 -0
  91. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/__init__.py +0 -0
  92. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/azure.py +0 -0
  93. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/box.py +0 -0
  94. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/dropbox.py +0 -0
  95. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/fsspec.py +0 -0
  96. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/gcs.py +0 -0
  97. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/s3.py +0 -0
  98. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/sftp.py +0 -0
  99. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/fsspec/utils.py +0 -0
  100. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/github.py +0 -0
  101. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/gitlab.py +0 -0
  102. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +0 -0
  103. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +0 -0
  104. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/jira.py +0 -0
  105. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kafka/__init__.py +0 -0
  106. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kafka/cloud.py +0 -0
  107. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kafka/kafka.py +0 -0
  108. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kafka/local.py +0 -0
  109. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/kdbai.py +0 -0
  110. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/__init__.py +0 -0
  111. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/aws.py +0 -0
  112. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/azure.py +0 -0
  113. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/cloud.py +0 -0
  114. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/gcp.py +0 -0
  115. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/lancedb.py +0 -0
  116. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/lancedb/local.py +0 -0
  117. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/local.py +0 -0
  118. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/milvus.py +0 -0
  119. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/mongodb.py +0 -0
  120. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/neo4j.py +0 -0
  121. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  122. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/client.py +0 -0
  123. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/connector.py +0 -0
  124. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/helpers.py +0 -0
  125. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +0 -0
  126. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +0 -0
  127. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +0 -0
  128. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +0 -0
  129. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/interfaces.py +0 -0
  130. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  131. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/block.py +0 -0
  132. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +0 -0
  133. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +0 -0
  134. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +0 -0
  135. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +0 -0
  136. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +0 -0
  137. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +0 -0
  138. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +0 -0
  139. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/code.py +0 -0
  140. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +0 -0
  141. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +0 -0
  142. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +0 -0
  143. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +0 -0
  144. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/file.py +0 -0
  145. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +0 -0
  146. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/image.py +0 -0
  147. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +0 -0
  148. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +0 -0
  149. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +0 -0
  150. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +0 -0
  151. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +0 -0
  152. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +0 -0
  153. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +0 -0
  154. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/table.py +0 -0
  155. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +0 -0
  156. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/template.py +0 -0
  157. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +0 -0
  158. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +0 -0
  159. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +0 -0
  160. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/blocks/video.py +0 -0
  161. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database.py +0 -0
  162. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +0 -0
  163. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +0 -0
  164. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +0 -0
  165. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +0 -0
  166. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +0 -0
  167. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +0 -0
  168. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +0 -0
  169. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +0 -0
  170. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +0 -0
  171. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +0 -0
  172. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +0 -0
  173. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +0 -0
  174. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +0 -0
  175. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +0 -0
  176. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +0 -0
  177. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +0 -0
  178. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +0 -0
  179. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +0 -0
  180. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +0 -0
  181. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +0 -0
  182. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +0 -0
  183. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +0 -0
  184. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +0 -0
  185. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/date.py +0 -0
  186. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/file.py +0 -0
  187. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/page.py +0 -0
  188. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/parent.py +0 -0
  189. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/rich_text.py +0 -0
  190. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/notion/types/user.py +0 -0
  191. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/onedrive.py +0 -0
  192. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/outlook.py +0 -0
  193. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/pinecone.py +0 -0
  194. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/__init__.py +0 -0
  195. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/cloud.py +0 -0
  196. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/local.py +0 -0
  197. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/qdrant.py +0 -0
  198. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/qdrant/server.py +0 -0
  199. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/redisdb.py +0 -0
  200. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/salesforce.py +0 -0
  201. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sharepoint.py +0 -0
  202. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/slack.py +0 -0
  203. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/__init__.py +0 -0
  204. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +0 -0
  205. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/postgres.py +0 -0
  206. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/singlestore.py +0 -0
  207. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/snowflake.py +0 -0
  208. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/sql.py +0 -0
  209. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/sqlite.py +0 -0
  210. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/sql/vastdb.py +0 -0
  211. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/utils.py +0 -0
  212. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/vectara.py +0 -0
  213. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/__init__.py +0 -0
  214. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/cloud.py +0 -0
  215. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/embedded.py +0 -0
  216. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/local.py +0 -0
  217. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/weaviate/weaviate.py +0 -0
  218. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  219. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/zendesk/client.py +0 -0
  220. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -0
  221. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/embedder.py +0 -0
  222. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/filter.py +0 -0
  223. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/partitioner.py +0 -0
  224. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/uncompress.py +0 -0
  225. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/utils/__init__.py +0 -0
  226. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/processes/utils/blob_storage.py +0 -0
  227. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/unstructured_api.py +0 -0
  228. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/__init__.py +0 -0
  229. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/chunking.py +0 -0
  230. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/compression.py +0 -0
  231. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/constants.py +0 -0
  232. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/data_prep.py +0 -0
  233. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/dep_check.py +0 -0
  234. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/html.py +0 -0
  235. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/ndjson.py +0 -0
  236. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/pydantic_models.py +0 -0
  237. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/string_and_date_utils.py +0 -0
  238. {unstructured_ingest-1.0.2 → unstructured_ingest-1.0.5}/unstructured_ingest/utils/table.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.2
3
+ Version: 1.0.5
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -0,0 +1 @@
1
+ __version__ = "1.0.5" # pragma: no cover
@@ -1,6 +1,6 @@
1
1
  import json
2
2
  from pathlib import Path
3
- from typing import Any, Optional
3
+ from typing import Any, Optional, Union
4
4
  from uuid import NAMESPACE_DNS, uuid5
5
5
 
6
6
  from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
@@ -29,9 +29,31 @@ class FileDataSourceMetadata(BaseModel):
29
29
  date_created: Optional[str] = None
30
30
  date_modified: Optional[str] = None
31
31
  date_processed: Optional[str] = None
32
- permissions_data: Optional[list[dict[str, Any]]] = None
32
+ permissions_data: Union[list[dict[str, Any]], dict[str, Any], None] = None
33
33
  filesize_bytes: Optional[int] = None
34
34
 
35
+ @field_validator("permissions_data", mode="before")
36
+ @classmethod
37
+ def coerce_permissions_data(cls, v: Any) -> Any:
38
+ if isinstance(v, dict):
39
+ # Temporarily convert dict to list for validation
40
+ return [v]
41
+ return v
42
+
43
+ @field_validator("permissions_data", mode="after")
44
+ @classmethod
45
+ def restore_dict_permissions_data(
46
+ cls, v: Optional[list[dict[str, Any]]]
47
+ ) -> Union[list[dict[str, Any]], dict[str, Any], None]:
48
+ if (
49
+ isinstance(v, list)
50
+ and len(v) == 1
51
+ and isinstance(v[0], dict)
52
+ and any(isinstance(val, dict) for val in v[0].values())
53
+ ):
54
+ return v[0]
55
+ return v
56
+
35
57
 
36
58
  class FileData(BaseModel):
37
59
  identifier: str
@@ -1,7 +1,8 @@
1
+ from collections import OrderedDict
1
2
  from contextlib import contextmanager
2
3
  from dataclasses import dataclass, field
3
4
  from pathlib import Path
4
- from typing import TYPE_CHECKING, Generator, List, Optional
5
+ from typing import TYPE_CHECKING, Generator, List, Optional, Tuple
5
6
 
6
7
  from pydantic import Field, Secret
7
8
 
@@ -135,35 +136,46 @@ class ConfluenceIndexer(Indexer):
135
136
  logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
136
137
  raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
137
138
 
138
- def _get_space_ids(self) -> List[str]:
139
+ def _get_space_ids_and_keys(self) -> List[Tuple[str, int]]:
140
+ """
141
+ Get a list of space IDs and keys from Confluence.
142
+
143
+ Example space ID (numerical): 98503
144
+ Example space key (str): "SD"
145
+ """
139
146
  spaces = self.index_config.spaces
140
147
  if spaces:
141
- return spaces
148
+ with self.connection_config.get_client() as client:
149
+ space_ids_and_keys = []
150
+ for space_key in spaces:
151
+ space = client.get_space(space_key)
152
+ space_ids_and_keys.append((space_key, space["id"]))
153
+ return space_ids_and_keys
142
154
  else:
143
155
  with self.connection_config.get_client() as client:
144
156
  all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
145
- space_ids = [space["key"] for space in all_spaces["results"]]
146
- return space_ids
157
+ space_ids_and_keys = [(space["key"], space["id"]) for space in all_spaces["results"]]
158
+ return space_ids_and_keys
147
159
 
148
- def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
160
+ def _get_docs_ids_within_one_space(self, space_key: str) -> List[dict]:
149
161
  with self.connection_config.get_client() as client:
150
162
  pages = client.get_all_pages_from_space(
151
- space=space_id,
163
+ space=space_key,
152
164
  start=0,
153
165
  limit=self.index_config.max_num_of_docs_from_each_space,
154
166
  expand=None,
155
- content_type="page",
167
+ content_type="page", # blogpost and comment types not currently supported
156
168
  status=None,
157
169
  )
158
- doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
170
+ doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in pages]
159
171
  return doc_ids
160
172
 
161
173
  def run(self) -> Generator[FileData, None, None]:
162
174
  from time import time
163
175
 
164
- space_ids = self._get_space_ids()
165
- for space_id in space_ids:
166
- doc_ids = self._get_docs_ids_within_one_space(space_id)
176
+ space_ids_and_keys = self._get_space_ids_and_keys()
177
+ for space_key, space_id in space_ids_and_keys:
178
+ doc_ids = self._get_docs_ids_within_one_space(space_key)
167
179
  for doc in doc_ids:
168
180
  doc_id = doc["doc_id"]
169
181
  # Build metadata
@@ -171,18 +183,19 @@ class ConfluenceIndexer(Indexer):
171
183
  date_processed=str(time()),
172
184
  url=f"{self.connection_config.url}/pages/{doc_id}",
173
185
  record_locator={
174
- "space_id": space_id,
186
+ "space_id": space_key,
175
187
  "document_id": doc_id,
176
188
  },
177
189
  )
178
190
  additional_metadata = {
179
- "space_id": space_id,
191
+ "space_key": space_key,
192
+ "space_id": space_id, # diff from record_locator space_id (which is space_key)
180
193
  "document_id": doc_id,
181
194
  }
182
195
 
183
196
  # Construct relative path and filename
184
197
  filename = f"{doc_id}.html"
185
- relative_path = str(Path(space_id) / filename)
198
+ relative_path = str(Path(space_key) / filename)
186
199
 
187
200
  source_identifiers = SourceIdentifiers(
188
201
  filename=filename,
@@ -201,7 +214,9 @@ class ConfluenceIndexer(Indexer):
201
214
 
202
215
 
203
216
  class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
204
- pass
217
+ max_num_metadata_permissions: int = Field(
218
+ 250, description="Approximate maximum number of permissions included in metadata"
219
+ )
205
220
 
206
221
 
207
222
  @dataclass
@@ -209,6 +224,8 @@ class ConfluenceDownloader(Downloader):
209
224
  connection_config: ConfluenceConnectionConfig
210
225
  download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
211
226
  connector_type: str = CONNECTOR_TYPE
227
+ _permissions_cache: dict = field(default_factory=OrderedDict)
228
+ _permissions_cache_max_size: int = 5
212
229
 
213
230
  def download_embedded_files(
214
231
  self, session, html: str, current_file_data: FileData
@@ -233,6 +250,148 @@ class ConfluenceDownloader(Downloader):
233
250
  session=session,
234
251
  )
235
252
 
253
def parse_permissions(self, doc_permissions: dict, space_permissions: list) -> dict[str, dict]:
    """
    Parses document and space permissions to determine final user/group roles.

    :param doc_permissions: dict containing document-level restrictions
        - doc_permissions type in Confluence: ContentRestrictionArray
    :param space_permissions: list of space-level permission assignments
        - space_permissions type in Confluence: list of SpacePermissionAssignment
    :return: dict keyed by "read", "update" and "delete"; each value is a dict with
        sorted "users" and "groups" id lists

    Document restrictions, when present, override the space-level permissions for the
    same operation. Otherwise the relevant space permissions (read, administer, delete)
    are applied, up to roughly ``download_config.max_num_metadata_permissions`` entries.

    Keys that are not a known operation ("read", "update", "delete"), restriction
    entries that are not "user"/"group" or carry no "results", and space principals
    that are neither users nor groups are ignored instead of raising.
    """
    roles = ("read", "update", "delete")
    buckets = {"user": "users", "group": "groups"}

    def page_restrictions(action: str, entity_type: str) -> list:
        """Return the page-level restriction entries for one action and entity type."""
        action_data = doc_permissions.get(action)
        if not isinstance(action_data, dict):
            return []
        restrictions = action_data.get("restrictions")
        if not isinstance(restrictions, dict):
            return []
        entity_data = restrictions.get(entity_type)
        if not isinstance(entity_data, dict):
            return []
        return entity_data.get("results") or []

    # Separate flags to track if view or edit is restricted at the page level
    page_view_restricted = any(page_restrictions("read", t) for t in buckets)
    page_edit_restricted = any(page_restrictions("update", t) for t in buckets)

    permissions_by_role = {role: {"users": set(), "groups": set()} for role in roles}
    total_permissions = 0

    for action in roles:
        for entity_type, bucket in buckets.items():
            for entity in page_restrictions(action, entity_type):
                entity_id = entity["accountId"] if entity_type == "user" else entity["id"]
                permissions_by_role[action][bucket].add(entity_id)
                total_permissions += 1
                # edit permission implies view permission
                if action == "update":
                    # not counted again so an entity is not double counted;
                    # may result in a higher total than max_num_metadata_permissions
                    permissions_by_role["read"][bucket].add(entity_id)

    max_permissions = self.download_config.max_num_metadata_permissions

    # "delete page" grants are checked against the final read set after the loop, so
    # the outcome does not depend on the order of the space permission list.
    pending_deletes = []

    for space_perm in space_permissions:
        if total_permissions >= max_permissions:
            # nothing further can be added once the cap is reached
            break

        space_operation = space_perm["operation"]["key"]
        space_target_type = space_perm["operation"]["targetType"]
        space_entity_id = space_perm["principal"]["id"]
        bucket = buckets.get(space_perm["principal"]["type"])
        if bucket is None:
            # principal is neither a user nor a group
            continue

        # Apply space-level view permissions if no page restrictions exist
        if space_target_type == "space" and space_operation == "read":
            if not page_view_restricted:
                permissions_by_role["read"][bucket].add(space_entity_id)
                total_permissions += 1

        # Administer permission includes view + edit. Apply if not page restricted
        elif space_target_type == "space" and space_operation == "administer":
            if not page_view_restricted:
                permissions_by_role["read"][bucket].add(space_entity_id)
                total_permissions += 1
            if not page_edit_restricted:
                # not counted again so an entity is not double counted;
                # may result in a higher total than max_num_metadata_permissions
                permissions_by_role["update"][bucket].add(space_entity_id)

        elif space_target_type == "page" and space_operation == "delete":
            pending_deletes.append((bucket, space_entity_id))

    # Add the "delete page" space permissions for entities that can also read the page
    for bucket, space_entity_id in pending_deletes:
        if total_permissions >= max_permissions:
            break
        if space_entity_id in permissions_by_role["read"][bucket]:
            permissions_by_role["delete"][bucket].add(space_entity_id)
            total_permissions += 1

    # turn sets into sorted lists for consistency and json serialization
    return {
        role: {bucket: sorted(ids) for bucket, ids in role_dict.items()}
        for role, role_dict in permissions_by_role.items()
    }
351
+
352
+ def _get_permissions_for_space(self, space_id: int) -> Optional[List[dict]]:
353
+ if space_id in self._permissions_cache:
354
+ self._permissions_cache.move_to_end(space_id) # mark recent use
355
+ logger.debug(f"Retrieved cached permissions for space {space_id}")
356
+ return self._permissions_cache[space_id]
357
+ else:
358
+ with self.connection_config.get_client() as client:
359
+ try:
360
+ # TODO limit the total number of results being called.
361
+ # not yet implemented because this client call doesn't allow for filtering for
362
+ # certain operations, so adding a limit here would result in too little data.
363
+ space_permissions = []
364
+ space_permissions_result = client.get(f"/api/v2/spaces/{space_id}/permissions")
365
+ space_permissions.extend(space_permissions_result["results"])
366
+ if space_permissions_result["_links"].get("next"): # pagination
367
+ while space_permissions_result.get("next"):
368
+ space_permissions_result = client.get(space_permissions_result["next"])
369
+ space_permissions.extend(space_permissions_result["results"])
370
+
371
+ if len(self._permissions_cache) >= self._permissions_cache_max_size:
372
+ self._permissions_cache.popitem(last=False) # LRU/FIFO eviction
373
+ self._permissions_cache[space_id] = space_permissions
374
+
375
+ logger.debug(f"Retrieved permissions for space {space_id}")
376
+ return space_permissions
377
+ except Exception as e:
378
+ logger.debug(f"Could not retrieve permissions for space {space_id}: {e}")
379
+ return None
380
+
381
+ def _parse_permissions_for_doc(self, doc_id: str, space_permissions: list) -> Optional[dict]:
382
+ with self.connection_config.get_client() as client:
383
+ try:
384
+ doc_permissions = client.get_all_restrictions_for_content(content_id=doc_id)
385
+ parsed_permissions_dict = self.parse_permissions(doc_permissions, space_permissions)
386
+
387
+ except Exception as e:
388
+ # skip writing any permission metadata
389
+ logger.debug(f"Could not retrieve permissions for doc {doc_id}: {e}")
390
+ return None
391
+
392
+ logger.debug(f"normalized permissions generated: {parsed_permissions_dict}")
393
+ return parsed_permissions_dict
394
+
236
395
  def run(self, file_data: FileData, **kwargs) -> download_responses:
237
396
  from bs4 import BeautifulSoup
238
397
 
@@ -268,6 +427,14 @@ class ConfluenceDownloader(Downloader):
268
427
  soup = BeautifulSoup(content, "html.parser")
269
428
  f.write(soup.prettify())
270
429
 
430
+ # Get document permissions and update metadata
431
+ space_id = file_data.additional_metadata["space_id"]
432
+ space_perm = self._get_permissions_for_space(space_id) # must be the id, NOT the space key
433
+ if space_perm:
434
+ combined_doc_permissions = self._parse_permissions_for_doc(doc_id, space_perm)
435
+ if combined_doc_permissions:
436
+ file_data.metadata.permissions_data = combined_doc_permissions
437
+
271
438
  # Update file_data with metadata
272
439
  file_data.metadata.date_created = page["history"]["createdDate"]
273
440
  file_data.metadata.date_modified = page["version"]["when"]
@@ -391,6 +391,7 @@ class GoogleDriveIndexer(Indexer):
391
391
  ) -> list[FileData]:
392
392
  root_info = self.get_root_info(files_client=files_client, object_id=object_id)
393
393
  if not self.is_dir(root_info):
394
+ root_info["permissions"] = self.extract_permissions(root_info.get("permissions"))
394
395
  data = [self.map_file_data(root_info)]
395
396
  else:
396
397
  file_contents = self.get_paginated_results(
@@ -400,11 +401,51 @@ class GoogleDriveIndexer(Indexer):
400
401
  recursive=recursive,
401
402
  previous_path=root_info["name"],
402
403
  )
403
- data = [self.map_file_data(f=f) for f in file_contents]
404
+ data = []
405
+ for f in file_contents:
406
+ f["permissions"] = self.extract_permissions(f.get("permissions"))
407
+ data.append(self.map_file_data(f=f))
404
408
  for d in data:
405
409
  d.metadata.record_locator["drive_id"]: object_id
406
410
  return data
407
411
 
412
def extract_permissions(self, permissions: Optional[list[dict]]) -> dict:
    """
    Normalize Google Drive permission entries into read/update/delete id lists.

    :param permissions: permission entries of a file; each entry is read for its
        "type", "role" and "id" keys
    :return: ``{}`` when no permissions are given, otherwise a dict keyed by "read",
        "update" and "delete", each holding sorted "users" and "groups" id lists

    Only "user" and "group" entries are considered. Entries with a role that has no
    mapping, or without an id, are skipped instead of raising.
    """
    if not permissions:
        logger.debug("no permissions found")
        return {}

    # https://developers.google.com/workspace/drive/api/guides/ref-roles
    role_mapping = {
        "owner": ["read", "update", "delete"],
        "organizer": ["read", "update", "delete"],
        "fileOrganizer": ["read", "update"],
        "writer": ["read", "update"],
        "commenter": ["read"],
        "reader": ["read"],
    }

    normalized_permissions = {
        "read": {"users": set(), "groups": set()},
        "update": {"users": set(), "groups": set()},
        "delete": {"users": set(), "groups": set()},
    }

    for item in permissions:
        # https://developers.google.com/workspace/drive/api/reference/rest/v3/permissions
        # ignore permissions for "anyone" and "domain"
        entity_type = item.get("type")
        if entity_type not in ("user", "group"):
            continue

        entity_id = item.get("id")
        operations = role_mapping.get(item.get("role"))
        if entity_id is None or operations is None:
            logger.debug(f"skipping permission with unmapped role or missing id: {item}")
            continue

        type_key = entity_type + "s"
        for operation in operations:
            normalized_permissions[operation][type_key].add(entity_id)

    # turn sets into sorted lists for consistency and json serialization
    for role_dict in normalized_permissions.values():
        for key in role_dict:
            role_dict[key] = sorted(role_dict[key])

    logger.debug(f"normalized permissions generated: {normalized_permissions}")
    return normalized_permissions
448
+
408
449
  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
409
450
  with self.connection_config.get_client() as client:
410
451
  for f in self.get_files(
@@ -1 +0,0 @@
1
- __version__ = "1.0.2" # pragma: no cover