unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,270 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+ from urllib.parse import urlparse
8
+
9
+ from pydantic import Field, Secret, model_validator
10
+
11
+ from unstructured_ingest.data_types.file_data import (
12
+ FileData,
13
+ FileDataSourceMetadata,
14
+ SourceIdentifiers,
15
+ )
16
+ from unstructured_ingest.error import SourceConnectionError, ValueError
17
+ from unstructured_ingest.interfaces import (
18
+ AccessConfig,
19
+ ConnectionConfig,
20
+ Downloader,
21
+ DownloaderConfig,
22
+ DownloadResponse,
23
+ Indexer,
24
+ IndexerConfig,
25
+ )
26
+ from unstructured_ingest.logger import logger
27
+ from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
28
+ from unstructured_ingest.utils.dep_check import requires_dependencies
29
+
30
# Value stamped into the `connector_type` field of every FileData the indexer yields.
CONNECTOR_TYPE = "gitlab"

# The `gitlab` package is declared as an extra via `requires_dependencies`, so it is
# imported here for static type checking only and never at module import time.
if TYPE_CHECKING:
    from gitlab import Gitlab
    from gitlab.v4.objects.projects import Project
34
+
35
+
36
class GitLabAccessConfig(AccessConfig):
    """Secret part of the GitLab connection settings."""

    # Handed to the GitLab client as its `private_token`; may be left as None.
    access_token: Optional[str] = Field(
        description="Optional personal access token for authenticating with the GitLab API.",
        default=None,
    )
41
+
42
+
43
class GitLabConnectionConfig(ConnectionConfig):
    """Connection settings for one GitLab project.

    `base_url` and `repo_path` are derived from `url` by the `set_repo_path`
    validator, which runs after the model has been built.
    """

    access_config: Secret[GitLabAccessConfig] = Field(
        default_factory=GitLabAccessConfig,
        validate_default=True,
        description="Secret configuration for accessing the GitLab API by authentication token.",
    )
    url: str = Field(description="The full URL to the GitLab project or repository.")
    base_url: str = Field(
        default="https://gitlab.com",
        description="The base URL for the GitLab instance (default is GitLab's public domain).",
    )
    # NOTE(review): annotated as `str` but declared with `default=None`; the value is
    # only meaningful once `set_repo_path` has run. Left unchanged to keep the model
    # schema stable for other consumers.
    repo_path: str = Field(
        default=None,
        init=False,
        repr=False,
        description="The normalized path extracted from the repository URL.",
    )

    @model_validator(mode="after")
    def set_repo_path(self):
        """Derive `base_url` and `repo_path` from `url`.

        When `url` carries both a scheme (e.g. 'https') and a network location
        (e.g. 'gitlab.com'), `base_url` is overwritten with ``scheme://netloc``;
        otherwise the configured `base_url` is kept.

        `repo_path` is the URL path with surrounding slashes removed and a
        trailing ``.git`` (as found in clone URLs) dropped, so that
        'https://gitlab.com/owner/repo', 'https://gitlab.com/owner/repo/' and
        'https://gitlab.com/owner/repo.git' all resolve to 'owner/repo'.

        Returns:
            The model instance itself, as required for an "after" validator.
        """
        parsed_url = urlparse(self.url)

        if parsed_url.scheme and parsed_url.netloc:
            self.base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

        # Strip both ends: a trailing slash would otherwise become part of the
        # project identifier passed to `client.projects.get`.
        repo_path = parsed_url.path.strip("/")
        git_suffix = ".git"
        if repo_path.endswith(git_suffix):
            repo_path = repo_path[: -len(git_suffix)]
        self.repo_path = repo_path

        return self

    @SourceConnectionError.wrap
    @requires_dependencies(["gitlab"], extras="gitlab")
    @contextmanager
    def get_client(self) -> Generator["Gitlab", None, None]:
        """Yield a `Gitlab` client bound to `base_url`.

        The client is built with the configured access token as its private
        token (which may be None) and is used as a context manager, so it is
        closed when the caller's ``with`` block exits.
        """
        # Imported lazily: the dependency is only checked by `requires_dependencies`.
        from gitlab import Gitlab

        logger.info(f"Connection to GitLab: {self.base_url!r}")
        token = self.access_config.get_secret_value().access_token
        with Gitlab(self.base_url, private_token=token) as client:
            yield client

    @contextmanager
    def get_project(self) -> Generator["Project", None, None]:
        """Yield the GitLab project identified by `repo_path`.

        The project is fetched through a client from `get_client`; that client
        stays open for as long as the caller holds the yielded project.

        Yields:
            Project: the object returned by ``client.projects.get(repo_path)``.

        Raises:
            Whatever `get_client` or ``client.projects.get`` raise; presumably
            `SourceConnectionError` for connection problems and
            ``gitlab.exceptions.GitlabGetError`` for an unknown project
            (TODO confirm against python-gitlab).
        """
        with self.get_client() as client:
            logger.info(f"Accessing Project: '{self.repo_path}'")
            project = client.projects.get(self.repo_path)

            logger.info(f"Successfully accessed project '{self.repo_path}'")
            yield project
115
+
116
+
117
class GitLabIndexerConfig(IndexerConfig):
    """Options selecting which part of the repository tree gets indexed."""

    # Sent (as a string) to the repository tree listing and used as the root
    # against which each file's relative path is computed.
    path: Path = Field(
        description=("Path to the location in the repository that will be processed."),
        default="/",
    )
    recursive: bool = Field(
        description=(
            "Flag to control recursive operations when indexing. "
            "If True, the indexer will traverse directories recursively."
        ),
        default=True,
    )
    # When left as None the indexer uses the project's default branch instead.
    git_branch: Optional[str] = Field(
        description="The name of the branch to interact with.",
        default=None,
    )
132
+
133
+
134
@dataclass
class GitLabIndexer(Indexer):
    """Lists the files of a GitLab project as `FileData` records."""

    connection_config: GitLabConnectionConfig
    index_config: GitLabIndexerConfig

    def precheck(self) -> None:
        """Check that the GitLab instance can be reached with the given settings.

        With an access token configured the client authenticates; without one
        the configured project is fetched instead.

        Raises:
            SourceConnectionError: If either call fails for any reason. The
                original exception is attached as the cause.
        """
        try:
            with self.connection_config.get_client() as client:
                access_config = self.connection_config.access_config.get_secret_value()
                if access_config.access_token is not None:
                    client.auth()
                else:
                    client.projects.get(self.connection_config.repo_path)

        except Exception as e:
            logger.error(f"Failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"Failed to validate connection: {e}") from e

    @staticmethod
    def _relative_path(file_path: str, root: Any) -> str:
        """Return `file_path` expressed relative to `root`.

        Both paths are compared without their anchor (leading slash), so a
        root of '/' or '/src' works with the repository-relative paths found
        in tree entries. If `file_path` does not lie under `root` it is
        returned as is (minus any anchor) instead of raising.
        """

        def unanchored(raw: Any) -> tuple:
            as_path = Path(raw)
            return as_path.parts[1:] if as_path.anchor else as_path.parts

        file_parts = unanchored(file_path)
        root_parts = unanchored(root)
        if file_parts[: len(root_parts)] == root_parts:
            file_parts = file_parts[len(root_parts) :]
        return str(Path(*file_parts))

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Walk the repository tree and yield one `FileData` per file (blob).

        The tree is listed for `index_config.git_branch`, falling back to the
        project's default branch, starting at `index_config.path`. Entries
        whose type is not "blob" are skipped.

        Args:
            **kwargs (Any): Accepted for interface compatibility; unused here.

        Yields:
            FileData: Carries the entry's id as identifier, a record locator
                with the file path and ref, and the entry's mode under
                `permissions_data`.
        """
        with self.connection_config.get_project() as project:
            ref = self.index_config.git_branch or project.default_branch

            entries = project.repository_tree(
                path=str(self.index_config.path),
                ref=ref,
                recursive=self.index_config.recursive,
                iterator=True,
                all=True,
            )

            for entry in entries:
                if entry["type"] != "blob":
                    continue

                full_path = entry["path"]
                source_identifiers = SourceIdentifiers(
                    fullpath=full_path,
                    filename=Path(full_path).name,
                    rel_path=self._relative_path(full_path, self.index_config.path),
                )
                # The downloader reads exactly these two keys back.
                record_locator = {
                    "file_path": full_path,
                    "ref": ref,
                }
                yield FileData(
                    identifier=entry["id"],
                    connector_type=CONNECTOR_TYPE,
                    source_identifiers=source_identifiers,
                    metadata=FileDataSourceMetadata(
                        # NOTE(review): this stores the tree entry id in `url`,
                        # not a web address - confirm that is intended.
                        url=entry["id"],
                        record_locator=record_locator,
                        permissions_data=[{"mode": entry["mode"]}],
                    ),
                    additional_metadata={},
                    display_name=source_identifiers.fullpath,
                )
210
+
211
+
212
class GitLabDownloaderConfig(DownloaderConfig):
    """Downloader settings for GitLab; adds nothing to `DownloaderConfig`."""
214
+
215
+
216
@dataclass
class GitLabDownloader(Downloader):
    """Fetches individual repository files described by `FileData` records."""

    connection_config: GitLabConnectionConfig
    download_config: GitLabDownloaderConfig

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download one file from the repository.

        Args:
            file_data (FileData): Metadata about the file to be downloaded.
            **kwargs (Any): Accepted for interface compatibility; unused here.

        Returns:
            DownloadResponse: Built by `generate_download_response` for the
                written file.

        Raises:
            ValueError: If no download path could be generated, or if the
                record locator is unusable. This is the `ValueError` this
                module imports from `unstructured_ingest.error`.
        """
        download_path = self.get_download_path(file_data=file_data)
        if download_path is None:
            logger.error(
                "Generated download path is None, source_identifiers might be missing "
                "from FileData."
            )
            raise ValueError("Generated invalid download path.")

        self._download_file(file_data, download_path)
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def _download_file(self, file_data: FileData, download_path: Path) -> None:
        """Write the file named by the record locator to `download_path`.

        The record locator must provide 'ref' and 'file_path'. Parent
        directories of `download_path` are created as needed.

        Raises:
            ValueError: If the record locator is missing or lacks a required key.
        """
        # NOTE: Indexer should supply the record locator in metadata
        record_locator = file_data.metadata.record_locator
        if (
            record_locator is None
            or "ref" not in record_locator
            or "file_path" not in record_locator
        ):
            logger.error(
                f"Invalid record locator in metadata: {record_locator}. "
                "Keys 'ref' and 'file_path' must be present."
            )
            raise ValueError("Invalid record locator.")

        ref = record_locator["ref"]
        path = record_locator["file_path"]
        download_path.parent.mkdir(exist_ok=True, parents=True)

        with self.connection_config.get_project() as project:
            project_file = project.files.get(file_path=path, ref=ref)
            with open(download_path, "wb") as file:
                file.write(project_file.decode())
262
+
263
+
264
# Registry entry bundling the GitLab connection, indexer and downloader classes.
gitlab_source_entry = SourceRegistryEntry(
    indexer=GitLabIndexer,
    indexer_config=GitLabIndexerConfig,
    downloader=GitLabDownloader,
    downloader_config=GitLabDownloaderConfig,
    connection_config=GitLabConnectionConfig,
)