unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,527 @@
1
+ from collections import OrderedDict
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Generator, List, Optional, Tuple
6
+
7
+ from pydantic import Field, Secret
8
+
9
+ from unstructured_ingest.data_types.file_data import (
10
+ FileData,
11
+ FileDataSourceMetadata,
12
+ SourceIdentifiers,
13
+ )
14
+ from unstructured_ingest.error import (
15
+ SourceConnectionError,
16
+ UserAuthError,
17
+ UserError,
18
+ ValueError,
19
+ )
20
+ from unstructured_ingest.interfaces import (
21
+ AccessConfig,
22
+ ConnectionConfig,
23
+ Downloader,
24
+ DownloaderConfig,
25
+ DownloadResponse,
26
+ Indexer,
27
+ IndexerConfig,
28
+ download_responses,
29
+ )
30
+ from unstructured_ingest.logger import logger
31
+ from unstructured_ingest.processes.connector_registry import (
32
+ SourceRegistryEntry,
33
+ )
34
+ from unstructured_ingest.utils.dep_check import requires_dependencies
35
+ from unstructured_ingest.utils.html import HtmlMixin
36
+ from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicode
37
+
38
+ if TYPE_CHECKING:
39
+ from atlassian import Confluence
40
+ from bs4 import BeautifulSoup
41
+ from bs4.element import Tag
42
+
43
+ CONNECTOR_TYPE = "confluence"
44
+
45
+
46
class ConfluenceAccessConfig(AccessConfig):
    """Secret credentials that can be supplied for a Confluence connection.

    All three fields default to ``None``; this model does not itself check
    which combination of them has been provided.
    """

    password: Optional[str] = Field(default=None, description="Confluence password")
    api_token: Optional[str] = Field(default=None, description="Confluence Cloud API token")
    token: Optional[str] = Field(default=None, description="Confluence Personal Access Token")
59
+
60
+
61
class ConfluenceConnectionConfig(ConnectionConfig):
    """Connection settings and client factory for a Confluence instance.

    Exactly one authentication mode must be configured:

    * basic auth: ``username`` together with either ``password`` or
      ``api_token`` from the access config, or
    * personal access token: ``token`` from the access config.
    """

    url: str = Field(description="URL of the Confluence instance")
    username: Optional[str] = Field(
        description="Username or email for authentication",
        default=None,
    )
    cloud: bool = Field(description="Authenticate to Confluence Cloud", default=False)
    access_config: Secret[ConfluenceAccessConfig] = Field(
        description="Access configuration for Confluence"
    )

    def model_post_init(self, __context):
        """Reject inconsistent or missing authentication settings.

        :raises ValueError: if both ``password`` and ``api_token`` are set, if
            ``cloud`` is enabled without basic auth, if basic auth and a
            personal access token are both supplied, or if no auth is supplied.
        """
        access_configs = self.access_config.get_secret_value()
        if access_configs.password and access_configs.api_token:
            raise ValueError(
                "both password and api_token provided, only one allowed, "
                "see: https://atlassian-python-api.readthedocs.io/"
            )
        # Basic auth needs a username plus one of the two secrets.
        basic_auth = bool(self.username and (access_configs.password or access_configs.api_token))
        pat_auth = access_configs.token
        if self.cloud and not basic_auth:
            raise ValueError(
                "cloud authentication requires username and API token (--password), "
                "see: https://atlassian-python-api.readthedocs.io/"
            )
        if basic_auth and pat_auth:
            raise ValueError(
                "both password and token provided, only one allowed, "
                "see: https://atlassian-python-api.readthedocs.io/"
            )
        if not (basic_auth or pat_auth):
            raise ValueError(
                "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
            )

    def password_or_api_token(self) -> Optional[str]:
        """Return the secret to pass as the client's ``password`` argument.

        :return: ``password`` when it is set, otherwise ``api_token``. The
            result is ``None`` when neither is configured, which is the case
            when only a personal access token is used.
        """
        # Confluence takes either password or API token under the same field: password
        # This ambiguity led to confusion, so we are making it specific what you are passing in
        access_configs = self.access_config.get_secret_value()
        if access_configs.password:
            return access_configs.password
        return access_configs.api_token

    @requires_dependencies(["atlassian"], extras="confluence")
    @contextmanager
    def get_client(self) -> Generator["Confluence", None, None]:
        """Yield a ``Confluence`` client built from this configuration.

        This is a context manager: the client is only constructed once the
        returned object is entered with ``with``.
        """
        from atlassian import Confluence

        access_configs = self.access_config.get_secret_value()
        with Confluence(
            url=self.url,
            username=self.username,
            password=self.password_or_api_token(),
            token=access_configs.token,
            cloud=self.cloud,
        ) as client:
            yield client
118
+
119
+
120
class ConfluenceIndexerConfig(IndexerConfig):
    """Limits and filters applied when listing Confluence content to index."""

    max_num_of_spaces: int = Field(
        default=500,
        description="Maximum number of spaces to index",
    )
    max_num_of_docs_from_each_space: int = Field(
        default=100,
        description="Maximum number of documents to fetch from each space",
    )
    spaces: Optional[List[str]] = Field(
        default=None,
        description="List of specific space keys to index",
    )
126
+
127
+
128
@dataclass
class ConfluenceIndexer(Indexer):
    """Lists Confluence pages and yields one ``FileData`` record per page."""

    connection_config: ConfluenceConnectionConfig
    index_config: ConfluenceIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> bool:
        """Verify that Confluence is reachable with the configured settings.

        :return: ``True`` when every check passes.
        :raises UserAuthError: if the client cannot be created.
        :raises UserError: if listing spaces fails, or if any space named in
            ``index_config.spaces`` cannot be fetched.
        """
        from contextlib import ExitStack

        with ExitStack() as stack:
            # get_client() is a context manager, so the client is only built
            # when it is entered; enter it inside the guard so construction
            # failures are reported as UserAuthError.
            try:
                client = stack.enter_context(self.connection_config.get_client())
            except Exception as e:
                logger.exception(f"Failed to connect to Confluence: {e}")
                raise UserAuthError(f"Failed to connect to Confluence: {e}") from e

            # opportunistically check the first space in list of all spaces
            try:
                client.get_all_spaces(limit=1)
            except Exception as e:
                logger.exception(f"Failed to connect to find any Confluence space: {e}")
                raise UserError(f"Failed to connect to find any Confluence space: {e}") from e

            logger.info("Connection to Confluence successful.")

            # If specific spaces are provided, check if we can access them.
            # Failures are collected so that all of them are reported at once.
            errors = []
            for space_key in self.index_config.spaces or []:
                try:
                    client.get_space(space_key)
                except Exception as e:
                    logger.exception(f"Failed to connect to Confluence: {e}")
                    errors.append(f"Failed to connect to '{space_key}' space, cause: '{e}'")

            if errors:
                raise UserError("\n".join(errors))

        return True

    def _get_space_ids_and_keys(self) -> List[Tuple[str, int]]:
        """
        Get ``(space_key, space_id)`` pairs for the spaces to index.

        When ``index_config.spaces`` is set, only those spaces are looked up;
        otherwise up to ``index_config.max_num_of_spaces`` spaces are listed.

        Example space ID (numerical): 98503
        Example space key (str): "SD"
        """
        configured_spaces = self.index_config.spaces
        with self.connection_config.get_client() as client:
            if configured_spaces:
                return [
                    (space_key, client.get_space(space_key)["id"])
                    for space_key in configured_spaces
                ]
            all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
            return [(space["key"], space["id"]) for space in all_spaces["results"]]

    def _get_docs_ids_within_one_space(self, space_key: str) -> List[dict]:
        """Return ``{"space_id": space_key, "doc_id": page_id}`` dicts for one space.

        At most ``index_config.max_num_of_docs_from_each_space`` entries are
        returned.
        """
        with self.connection_config.get_client() as client:
            pages = client.get_all_pages_from_space(
                space=space_key,
                start=0,
                expand=None,
                content_type="page",  # blogpost and comment types not currently supported
                status=None,
            )
        # Limit the number of documents to max_num_of_docs_from_each_space
        # Note: this is needed because the limit field in client.get_all_pages_from_space does
        # not seem to work as expected
        limited_pages = pages[: self.index_config.max_num_of_docs_from_each_space]
        return [{"space_id": space_key, "doc_id": page["id"]} for page in limited_pages]

    def run(self) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every page found in the selected spaces.

        Each record is named ``<doc_id>.html`` and placed under a directory
        named after its space key.
        """
        from time import time

        for space_key, space_id in self._get_space_ids_and_keys():
            for doc in self._get_docs_ids_within_one_space(space_key):
                doc_id = doc["doc_id"]
                # Build metadata
                metadata = FileDataSourceMetadata(
                    date_processed=str(time()),
                    url=f"{self.connection_config.url}/pages/{doc_id}",
                    record_locator={
                        "space_id": space_key,
                        "document_id": doc_id,
                    },
                )
                additional_metadata = {
                    "space_key": space_key,
                    "space_id": space_id,  # diff from record_locator space_id (which is space_key)
                    "document_id": doc_id,
                }

                # Construct relative path and filename
                filename = f"{doc_id}.html"
                relative_path = str(Path(space_key) / filename)

                source_identifiers = SourceIdentifiers(
                    filename=filename,
                    fullpath=relative_path,
                    rel_path=relative_path,
                )

                yield FileData(
                    identifier=doc_id,
                    connector_type=self.connector_type,
                    metadata=metadata,
                    additional_metadata=additional_metadata,
                    source_identifiers=source_identifiers,
                    display_name=source_identifiers.fullpath,
                )
246
+
247
+
248
class ConfluenceDownloaderConfig(HtmlMixin, DownloaderConfig):
    """Download settings for Confluence pages, including embedded-file lookup."""

    max_num_metadata_permissions: int = Field(
        default=250,
        description="Approximate maximum number of permissions included in metadata",
    )

    @requires_dependencies(["bs4"])
    def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
        """Collect the ``<a>`` tags that point at embedded attachments.

        Only anchors carrying the ``confluence-embedded-file`` class, an
        ``attachment`` linked-resource type and an ``href`` are matched.
        """
        from bs4.element import Tag

        attachment_filter = {
            "class": "confluence-embedded-file",
            "data-linked-resource-type": "attachment",
            "href": True,
        }
        matching_tags = []
        for candidate in html_soup.find_all("a", attrs=attachment_filter):
            # Keep only real Tag objects from the search results.
            if isinstance(candidate, Tag):
                matching_tags.append(candidate)
        return matching_tags
269
+
270
+
271
+ @dataclass
272
+ class ConfluenceDownloader(Downloader):
273
+ connection_config: ConfluenceConnectionConfig
274
+ download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
275
+ connector_type: str = CONNECTOR_TYPE
276
+ _permissions_cache: dict = field(default_factory=OrderedDict)
277
+ _permissions_cache_max_size: int = 5
278
+
279
def download_embedded_files(
    self, session, html: str, current_file_data: FileData
) -> list[DownloadResponse]:
    """Download the files embedded in a page's HTML, when enabled.

    :param session: session object forwarded to the extraction helper
    :param html: HTML content of the page
    :param current_file_data: record describing the page being processed
    :return: download responses for the embedded files; an empty list when
        ``download_config.extract_files`` is off or the page has no URL
    """
    if not self.download_config.extract_files:
        return []
    url = current_file_data.metadata.url
    if url is None:
        # Single-line message: a triple-quoted literal here would leak a raw
        # newline and source indentation into the log output.
        logger.warning(
            f"Missing URL for file: {current_file_data.source_identifiers.filename}. "
            "Skipping file extraction."
        )
        return []
    filepath = current_file_data.source_identifiers.relative_path
    download_path = Path(self.download_dir) / filepath
    # Embedded files go into a directory named after the page file, minus its suffix.
    download_dir = download_path.with_suffix("")
    return self.download_config.extract_embedded_files(
        url=url,
        download_dir=download_dir,
        original_filedata=current_file_data,
        html=html,
        session=session,
    )
301
+
302
def parse_permissions(self, doc_permissions: dict, space_permissions: list) -> dict[str, dict]:
    """
    Parses document and space permissions to determine final user/group roles.

    :param doc_permissions: dict containing document-level restrictions
        - doc_permissions type in Confluence: ContentRestrictionArray
    :param space_permissions: list of space-level permission assignments
        - space_permissions type in Confluence: list of SpacePermissionAssignment
    :return: dict with operation as keys ("read", "update", "delete") and each
        maps to dict with "users" and "groups", both sorted lists of ids

    Get document permissions. If they exist, they will override space level permissions.
    Otherwise, apply relevant space permissions (read, administer, delete)

    Entries that cannot be mapped onto the output are skipped rather than
    raising: operations other than read/update/delete, restriction or
    principal types other than user/group, and restriction entries without
    a "results" list.
    """

    def _is_page_restricted(operation: str) -> bool:
        # True when the page itself lists any user or group for this operation.
        restrictions = doc_permissions.get(operation, {}).get("restrictions", {})
        return bool(
            restrictions.get("user", {}).get("results")
            or restrictions.get("group", {}).get("results")
        )

    # Separate flags to track if view or edit is restricted at the page level
    page_view_restricted = _is_page_restricted("read")
    page_edit_restricted = _is_page_restricted("update")

    permissions_by_role = {
        "read": {"users": set(), "groups": set()},
        "update": {"users": set(), "groups": set()},
        "delete": {"users": set(), "groups": set()},
    }

    total_permissions = 0

    for action, permissions in doc_permissions.items():
        role_buckets = permissions_by_role.get(action)
        if role_buckets is None or not isinstance(permissions, dict):
            # e.g. link/metadata keys that sit next to the operations
            continue

        for entity_type, entity_data in permissions.get("restrictions", {}).items():
            bucket_name = f"{entity_type}s"
            if bucket_name not in role_buckets or not isinstance(entity_data, dict):
                continue
            # "results" may be absent or None; treat both as no restrictions.
            for entity in entity_data.get("results") or []:
                entity_id = entity["accountId"] if entity_type == "user" else entity["id"]
                role_buckets[bucket_name].add(entity_id)
                total_permissions += 1
                # edit permission implies view permission
                if action == "update":
                    permissions_by_role["read"][bucket_name].add(entity_id)
                    # not counted again, to not double count an entity.
                    # may result in a higher total count than max_num_metadata_permissions

    max_permissions = self.download_config.max_num_metadata_permissions
    for space_perm in space_permissions:
        # The count never decreases, so nothing more can be added past the cap.
        if total_permissions >= max_permissions:
            break

        space_operation = space_perm["operation"]["key"]
        space_target_type = space_perm["operation"]["targetType"]
        space_entity_id = space_perm["principal"]["id"]
        space_entity_type = space_perm["principal"]["type"]

        bucket_name = f"{space_entity_type}s"
        if bucket_name not in permissions_by_role["read"]:
            # principal kinds other than user/group have no slot in the output
            continue

        # Apply space-level view permissions if no page restrictions exist
        if (
            space_target_type == "space"
            and space_operation == "read"
            and not page_view_restricted
        ):
            permissions_by_role["read"][bucket_name].add(space_entity_id)
            total_permissions += 1

        # Administer permission includes view + edit. Apply if not page restricted
        elif space_target_type == "space" and space_operation == "administer":
            if not page_view_restricted:
                permissions_by_role["read"][bucket_name].add(space_entity_id)
                total_permissions += 1
            if not page_edit_restricted:
                permissions_by_role["update"][bucket_name].add(space_entity_id)
                # not counted again, to not double count an entity.
                # may result in a higher total count than max_num_metadata_permissions

        # Add the "delete page" space permissions if there are other page permissions
        elif (
            space_target_type == "page"
            and space_operation == "delete"
            and space_entity_id in permissions_by_role["read"][bucket_name]
        ):
            permissions_by_role["delete"][bucket_name].add(space_entity_id)
            total_permissions += 1

    # turn sets into sorted lists for consistency and json serialization
    for role_dict in permissions_by_role.values():
        for key in role_dict:
            role_dict[key] = sorted(role_dict[key])

    return permissions_by_role
400
+
401
def _get_permissions_for_space(self, space_id: int) -> Optional[List[dict]]:
    """Return the permission entries of a Confluence space, using the cache.

    A cached entry is returned directly and marked as most recently used.
    Otherwise the space permissions endpoint is queried and every page
    advertised through ``_links.next`` is followed. The combined result is
    stored in the cache, evicting the oldest entry once the cache holds
    ``_permissions_cache_max_size`` items.

    Args:
        space_id: Id of the space (the caller passes the id, not the space key).

    Returns:
        The list of permission dicts, or ``None`` when the initial request
        (or caching the result) fails. If only a follow-up page fails, the
        pages collected so far are returned.
    """
    if space_id in self._permissions_cache:
        self._permissions_cache.move_to_end(space_id)  # mark recent use
        logger.debug(f"Retrieved cached permissions for space {space_id}")
        return self._permissions_cache[space_id]

    with self.connection_config.get_client() as client:
        try:
            # TODO limit the total number of results being called.
            # not yet implemented because this client call doesn't allow for filtering for
            # certain operations, so adding a limit here would result in too little data.
            response = client.get(f"/api/v2/spaces/{space_id}/permissions")
            space_permissions = list(response["results"])

            # Pagination: the link to the next page lives under "_links", not at
            # the top level of the response, so it must be read from there on
            # every iteration.
            next_link = (response.get("_links") or {}).get("next")
            while next_link:
                try:
                    # NOTE(review): the link is handed to the client unchanged;
                    # confirm it resolves correctly against the client's base URL.
                    response = client.get(next_link)
                except Exception as e:
                    # Keep the pages already collected rather than discarding
                    # everything because a later page could not be fetched.
                    logger.debug(
                        f"Could not retrieve further permissions for space {space_id}: {e}"
                    )
                    break
                space_permissions.extend(response["results"])
                next_link = (response.get("_links") or {}).get("next")

            if len(self._permissions_cache) >= self._permissions_cache_max_size:
                self._permissions_cache.popitem(last=False)  # LRU/FIFO eviction
            self._permissions_cache[space_id] = space_permissions

            logger.debug(f"Retrieved permissions for space {space_id}")
            return space_permissions
        except Exception as e:
            logger.debug(f"Could not retrieve permissions for space {space_id}: {e}")
            return None
429
+
430
def _parse_permissions_for_doc(
    self, doc_id: str, space_permissions: list
) -> Optional[list[dict]]:
    """Build the normalized permission list for a single page.

    Fetches the page's content restrictions, merges them with the given
    space permissions via ``parse_permissions`` and converts the resulting
    mapping into a list of single-key dicts (one per role).

    Args:
        doc_id: Id of the page whose restrictions are fetched.
        space_permissions: Permission entries of the page's space.

    Returns:
        A list of ``{role: value}`` dicts, or ``None`` if anything fails
        (in which case no permission metadata should be written).
    """
    with self.connection_config.get_client() as client:
        try:
            restrictions = client.get_all_restrictions_for_content(content_id=doc_id)
            by_role = self.parse_permissions(restrictions, space_permissions)
            normalized = [{role: entities} for role, entities in by_role.items()]
        except Exception as e:
            # skip writing any permission metadata
            logger.debug(f"Could not retrieve permissions for doc {doc_id}: {e}")
            return None

    logger.debug(f"normalized permissions generated: {normalized}")
    return normalized
446
+
447
def run(self, file_data: FileData, **kwargs) -> download_responses:
    """Download one Confluence page as HTML and enrich its metadata.

    Steps, in order:
      1. Fetch the page (history, version and rendered body).
      2. Wrap the body in a ``<body>`` element with the title as ``<h1>``,
         optionally extract images, and write the prettified HTML to
         ``download_dir / relative_path``.
      3. Attach permission metadata when both space and page permissions
         could be retrieved.
      4. Record created/modified dates, version and display name.
      5. Optionally download embedded files and return their responses
         together with the page's own response.

    Args:
        file_data: Describes the page to download; updated in place.
        **kwargs: Accepted for interface compatibility; unused here.

    Returns:
        The download response for the page, or a list of responses (embedded
        files first, the page last) when embedded files were extracted.

    Raises:
        SourceConnectionError: If the page cannot be retrieved.
        ValueError: If the client returns an empty page.
    """
    from html import escape

    from bs4 import BeautifulSoup

    doc_id = file_data.identifier
    try:
        with self.connection_config.get_client() as client:
            page = client.get_page_by_id(
                page_id=doc_id,
                expand="history.lastUpdated,version,body.view",
            )
    except Exception as e:
        logger.exception(f"Failed to retrieve page with ID {doc_id}: {e}")
        # Chain the original error so the root cause stays in the traceback.
        raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}") from e

    if not page:
        raise ValueError(f"Page with ID {doc_id} does not exist.")

    content = page["body"]["view"]["value"]
    title = page["title"]
    # Using h1 for title is supported by both v1 and v2 html parsing in unstructured.
    # The title is plain text, so escape it before embedding it in markup;
    # otherwise characters such as "<" or "&" would corrupt the heading.
    title_html = f"<h1>{escape(title, quote=False)}</h1>"
    content = fix_unescaped_unicode(f"<body class='Document' >{title_html}{content}</body>")
    if self.download_config.extract_images:
        with self.connection_config.get_client() as client:
            content = self.download_config.extract_html_images(
                url=file_data.metadata.url, html=content, session=client._session
            )

    filepath = file_data.source_identifiers.relative_path
    download_path = Path(self.download_dir) / filepath
    download_path.parent.mkdir(parents=True, exist_ok=True)
    with open(download_path, "w", encoding="utf8") as f:
        soup = BeautifulSoup(content, "html.parser")
        f.write(soup.prettify())

    # Get document permissions and update metadata
    space_id = file_data.additional_metadata["space_id"]
    space_perm = self._get_permissions_for_space(space_id)  # must be the id, NOT the space key
    if space_perm:
        combined_doc_permissions = self._parse_permissions_for_doc(doc_id, space_perm)
        if combined_doc_permissions:
            file_data.metadata.permissions_data = combined_doc_permissions

    # Update file_data with metadata
    file_data.metadata.date_created = page["history"]["createdDate"]
    file_data.metadata.date_modified = page["version"]["when"]
    file_data.metadata.version = str(page["version"]["number"])
    file_data.display_name = title  # keep the raw (unescaped) title for display

    download_response = self.generate_download_response(
        file_data=file_data, download_path=download_path
    )
    if self.download_config.extract_files:
        with self.connection_config.get_client() as client:
            extracted_download_responses = self.download_embedded_files(
                html=content,
                current_file_data=download_response["file_data"],
                session=client._session,
            )
            if extracted_download_responses:
                # Place each embedded file under a directory named after the
                # page's full path with its suffix removed.
                source_file_path = Path(file_data.source_identifiers.fullpath).with_suffix("")
                for dr in extracted_download_responses:
                    fd = dr["file_data"]
                    new_fullpath = source_file_path / fd.source_identifiers.filename
                    fd.source_identifiers = SourceIdentifiers(
                        fullpath=new_fullpath.as_posix(), filename=new_fullpath.name
                    )
                extracted_download_responses.append(download_response)
                return extracted_download_responses
    return download_response
519
+
520
+
521
# Source registry entry tying the Confluence connection config to its
# indexer and downloader, each paired with its own config class.
confluence_source_entry = SourceRegistryEntry(
    connection_config=ConfluenceConnectionConfig,
    indexer=ConfluenceIndexer,
    indexer_config=ConfluenceIndexerConfig,
    downloader=ConfluenceDownloader,
    downloader_config=ConfluenceDownloaderConfig,
)