unstructured-ingest 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (35)
  1. test/integration/connectors/test_confluence.py +113 -0
  2. test/integration/connectors/test_kafka.py +67 -0
  3. test/integration/connectors/test_onedrive.py +112 -0
  4. test/integration/connectors/test_qdrant.py +137 -0
  5. test/integration/connectors/utils/docker.py +2 -1
  6. test/integration/connectors/utils/validation.py +73 -22
  7. unstructured_ingest/__version__.py +1 -1
  8. unstructured_ingest/connector/kafka.py +0 -1
  9. unstructured_ingest/interfaces.py +7 -7
  10. unstructured_ingest/v2/processes/chunker.py +2 -2
  11. unstructured_ingest/v2/processes/connectors/__init__.py +12 -1
  12. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  13. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  14. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +1 -10
  15. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  16. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +13 -0
  17. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +82 -0
  18. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +196 -0
  19. unstructured_ingest/v2/processes/connectors/kafka/local.py +75 -0
  20. unstructured_ingest/v2/processes/connectors/onedrive.py +163 -2
  21. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  22. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  23. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  24. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  25. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  26. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  27. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  28. unstructured_ingest/v2/processes/partitioner.py +14 -3
  29. unstructured_ingest/v2/unstructured_api.py +24 -10
  30. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/METADATA +22 -22
  31. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/RECORD +35 -20
  32. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/LICENSE.md +0 -0
  33. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/WHEEL +0 -0
  34. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/entry_points.txt +0 -0
  35. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -2,6 +2,8 @@ from __future__ import annotations
 
 import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
@@ -16,12 +18,16 @@ from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONN
 from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
+from .confluence import confluence_source_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
+from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
 from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
@@ -33,7 +39,7 @@ from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
-from .onedrive import onedrive_source_entry
+from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -72,6 +78,7 @@ add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
 add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)
 
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
+add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
 add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
 add_destination_entry(
@@ -99,4 +106,8 @@ add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entr
 
 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
 
+add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
+
 add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
+
+add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
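The hunks above register the new Confluence, GitLab, Kafka, and Qdrant connectors at import time. A minimal sketch of confirming that, assuming the registry module also exposes a source_registry mapping keyed by connector type (this diff only shows add_source_entry/add_destination_entry, so that name is an assumption):

# Hypothetical check; `source_registry` is assumed, not shown in this diff.
import unstructured_ingest.v2.processes.connectors  # noqa: F401  # runs the add_*_entry calls above
from unstructured_ingest.v2.processes.connector_registry import source_registry

for connector_type in ("confluence", "gitlab", "kafka-cloud"):
    print(connector_type, "registered:", connector_type in source_registry)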
unstructured_ingest/v2/processes/connectors/confluence.py (new file)
@@ -0,0 +1,195 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Generator, List, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    download_responses,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from atlassian import Confluence
+
+CONNECTOR_TYPE = "confluence"
+
+
+class ConfluenceAccessConfig(AccessConfig):
+    api_token: str = Field(description="Confluence API token")
+
+
+class ConfluenceConnectionConfig(ConnectionConfig):
+    url: str = Field(description="URL of the Confluence instance")
+    user_email: str = Field(description="User email for authentication")
+    access_config: Secret[ConfluenceAccessConfig] = Field(
+        description="Access configuration for Confluence"
+    )
+
+    @requires_dependencies(["atlassian"], extras="confluence")
+    def get_client(self) -> "Confluence":
+        from atlassian import Confluence
+
+        access_configs = self.access_config.get_secret_value()
+        return Confluence(
+            url=self.url,
+            username=self.user_email,
+            password=access_configs.api_token,
+        )
+
+
+class ConfluenceIndexerConfig(IndexerConfig):
+    max_num_of_spaces: int = Field(500, description="Maximum number of spaces to index")
+    max_num_of_docs_from_each_space: int = Field(
+        100, description="Maximum number of documents to fetch from each space"
+    )
+    spaces: Optional[List[str]] = Field(None, description="List of specific space keys to index")
+
+
+@dataclass
+class ConfluenceIndexer(Indexer):
+    connection_config: ConfluenceConnectionConfig
+    index_config: ConfluenceIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> bool:
+        try:
+            # Attempt to retrieve a list of spaces with limit=1.
+            # This should only succeed if all creds are valid
+            client = self.connection_config.get_client()
+            client.get_all_spaces(limit=1)
+            logger.info("Connection to Confluence successful.")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
+
+    def _get_space_ids(self) -> List[str]:
+        spaces = self.index_config.spaces
+        if spaces:
+            return spaces
+        else:
+            client = self.connection_config.get_client()
+            all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
+            space_ids = [space["key"] for space in all_spaces["results"]]
+            return space_ids
+
+    def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
+        client = self.connection_config.get_client()
+        pages = client.get_all_pages_from_space(
+            space=space_id,
+            start=0,
+            limit=self.index_config.max_num_of_docs_from_each_space,
+            expand=None,
+            content_type="page",
+            status=None,
+        )
+        doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
+        return doc_ids
+
+    def run(self) -> Generator[FileData, None, None]:
+        from time import time
+
+        space_ids = self._get_space_ids()
+        for space_id in space_ids:
+            doc_ids = self._get_docs_ids_within_one_space(space_id)
+            for doc in doc_ids:
+                doc_id = doc["doc_id"]
+                # Build metadata
+                metadata = FileDataSourceMetadata(
+                    date_processed=str(time()),
+                    url=f"{self.connection_config.url}/pages/{doc_id}",
+                    record_locator={
+                        "space_id": space_id,
+                        "document_id": doc_id,
+                    },
+                )
+                additional_metadata = {
+                    "space_id": space_id,
+                    "document_id": doc_id,
+                }
+
+                # Construct relative path and filename
+                filename = f"{doc_id}.html"
+                relative_path = str(Path(space_id) / filename)
+
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    fullpath=relative_path,
+                    rel_path=relative_path,
+                )
+
+                file_data = FileData(
+                    identifier=doc_id,
+                    connector_type=self.connector_type,
+                    metadata=metadata,
+                    additional_metadata=additional_metadata,
+                    source_identifiers=source_identifiers,
+                )
+                yield file_data
+
+
+class ConfluenceDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class ConfluenceDownloader(Downloader):
+    connection_config: ConfluenceConnectionConfig
+    download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def run(self, file_data: FileData, **kwargs) -> download_responses:
+        doc_id = file_data.identifier
+        try:
+            client = self.connection_config.get_client()
+            page = client.get_page_by_id(
+                page_id=doc_id,
+                expand="history.lastUpdated,version,body.view",
+            )
+        except Exception as e:
+            logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
+
+        if not page:
+            raise ValueError(f"Page with ID {doc_id} does not exist.")
+
+        content = page["body"]["view"]["value"]
+
+        filepath = file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(content)
+
+        # Update file_data with metadata
+        file_data.metadata.date_created = page["history"]["createdDate"]
+        file_data.metadata.date_modified = page["version"]["when"]
+        file_data.metadata.version = str(page["version"]["number"])
+        file_data.display_name = page["title"]
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+confluence_source_entry = SourceRegistryEntry(
+    connection_config=ConfluenceConnectionConfig,
+    indexer_config=ConfluenceIndexerConfig,
+    indexer=ConfluenceIndexer,
+    downloader_config=ConfluenceDownloaderConfig,
+    downloader=ConfluenceDownloader,
+)
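Taken together: the indexer enumerates space keys, lists pages per space, and yields one FileData per page; the downloader then fetches the body.view HTML and writes it to <space>/<page_id>.html. A hedged usage sketch, with the instance URL, email, and token as placeholders:

from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
    ConfluenceIndexer,
    ConfluenceIndexerConfig,
)

connection = ConfluenceConnectionConfig(
    url="https://example.atlassian.net/wiki",  # placeholder instance URL
    user_email="user@example.com",  # placeholder account
    access_config=ConfluenceAccessConfig(api_token="..."),  # placeholder token
)
indexer = ConfluenceIndexer(
    connection_config=connection,
    index_config=ConfluenceIndexerConfig(spaces=["DOCS"]),  # limit indexing to one space key
)
indexer.precheck()  # raises SourceConnectionError if the credentials are bad
for file_data in indexer.run():
    print(file_data.identifier, file_data.source_identifiers.fullpath)  # e.g. DOCS/12345.html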
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -148,9 +148,7 @@ class DatabricksVolumesDownloader(Downloader, ABC):
 
 
 class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass
 
 
 @dataclass
@@ -173,5 +171,5 @@ class DatabricksVolumesUploader(Uploader, ABC):
             self.connection_config.get_client().files.upload(
                 file_path=output_path,
                 contents=elements_file,
-                overwrite=self.upload_config.overwrite,
+                overwrite=True,
             )
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -231,9 +231,7 @@ class FsspecDownloader(Downloader):
 
 
 class FsspecUploaderConfig(FileConfig, UploaderConfig):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass
 
 
 FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
@@ -288,9 +286,6 @@ class FsspecUploader(Uploader):
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-        if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
 
@@ -298,9 +293,5 @@ class FsspecUploader(Uploader):
         upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
         # Odd that fsspec doesn't run exists() as async even when client support async
-        already_exists = self.fs.exists(path=str(upload_path))
-        if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
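Both the Databricks Volumes and fsspec uploaders drop their overwrite option in this release: uploads now overwrite unconditionally (overwrite=True is hardcoded for Databricks, and the fs.exists() guard is removed from both the sync and async fsspec paths). Callers that relied on overwrite=False can reinstate the guard themselves; a minimal sketch using fsspec's public filesystem()/exists()/upload() calls, with the protocol and paths as placeholders:

import fsspec

fs = fsspec.filesystem("s3")  # placeholder protocol; any installed fsspec backend works
upload_path = "bucket/output/elements.json"  # placeholder destination
if fs.exists(upload_path):
    print(f"skipping upload, {upload_path} already exists")
else:
    fs.upload(lpath="local/elements.json", rpath=upload_path)  # placeholder local file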
unstructured_ingest/v2/processes/connectors/gitlab.py (new file)
@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from urllib.parse import urlparse
+
+from pydantic import Field, Secret, model_validator
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+CONNECTOR_TYPE = "gitlab"
+if TYPE_CHECKING:
+    from gitlab import Gitlab
+    from gitlab.v4.objects.projects import Project
+
+
+class GitLabAccessConfig(AccessConfig):
+    access_token: Optional[str] = Field(
+        default=None,
+        description="Optional personal access token for authenticating with the GitLab API.",
+    )
+
+
+class GitLabConnectionConfig(ConnectionConfig):
+    access_config: Secret[GitLabAccessConfig] = Field(
+        default_factory=GitLabAccessConfig,
+        validate_default=True,
+        description="Secret configuration for accessing the GitLab API by authentication token.",
+    )
+    url: str = Field(description="The full URL to the GitLab project or repository.")
+    base_url: str = Field(
+        default="https://gitlab.com",
+        description="The base URL for the GitLab instance (default is GitLab's public domain).",
+    )
+    repo_path: str = Field(
+        default=None,
+        init=False,
+        repr=False,
+        description="The normalized path extracted from the repository URL.",
+    )
+
+    @model_validator(mode="after")
+    def set_repo_path(self):
+        """
+        Parses the provided GitLab URL to extract the `base_url` and `repo_path`,
+        ensuring both are properly formatted for use.
+
+        If the URL contains a scheme (e.g., 'https') and a network location (e.g., 'gitlab.com'),
+        the `base_url` is set accordingly. The repository path is extracted and normalized
+        by removing any leading slashes.
+
+        Notes:
+            - If the URL contains both a scheme and network location, the `base_url` is
+              extracted directly from the URL.
+            - The `repo_path` is adjusted to remove any leading slashes.
+            - This method assumes that the URL follows GitLab's structure
+              (e.g., 'https://gitlab.com/owner/repo').
+        """
+        parsed_gh_url = urlparse(self.url)
+
+        if parsed_gh_url.scheme and parsed_gh_url.netloc:
+            self.base_url = f"{parsed_gh_url.scheme}://{parsed_gh_url.netloc}"
+        self.repo_path = parsed_gh_url.path.lstrip("/")
+
+        return self
+
+    @SourceConnectionError.wrap
+    @requires_dependencies(["gitlab"], extras="gitlab")
+    def get_client(self) -> "Gitlab":
+        from gitlab import Gitlab
+
+        logger.info(f"Connection to GitLab: {self.base_url!r}")
+        gitlab = Gitlab(
+            self.base_url, private_token=self.access_config.get_secret_value().access_token
+        )
+        return gitlab
+
+    def get_project(self) -> "Project":
+        """Retrieves the specified GitLab project using the configured base URL and access token.
+
+        Returns:
+            Project: A GitLab `Project` object representing the specified repository.
+
+        Raises:
+            SourceConnectionError: If the GitLab API connection fails.
+            gitlab.exceptions.GitlabGetError: If the project is not found.
+        """
+        gitlab = self.get_client()
+
+        logger.info(f"Accessing Project: '{self.repo_path}'")
+        project = gitlab.projects.get(self.repo_path)
+
+        logger.info(f"Successfully accessed project '{self.repo_path}'")
+        return project
+
+
+class GitLabIndexerConfig(IndexerConfig):
+    path: Path = Field(
+        default="/", description=("Path to the location in the repository that will be processed.")
+    )
+    recursive: bool = Field(
+        default=True,
+        description=(
+            "Flag to control recursive operations when indexing. "
+            "If True, the indexer will traverse directories recursively."
+        ),
+    )
+    git_branch: Optional[str] = Field(
+        default=None,
+        description="The name of the branch to interact with.",
+    )
+
+
+@dataclass
+class GitLabIndexer(Indexer):
+    connection_config: GitLabConnectionConfig
+    index_config: GitLabIndexerConfig
+
+    def precheck(self) -> None:
+        """Validates the connection to the GitLab instance by authenticating or
+        accessing the project.
+
+        This method ensures that the GitLab credentials and configuration are correct by
+        either authenticating or attempting to fetch the specified project.
+
+        Raises:
+            SourceConnectionError: If the connection or authentication with GitLab fails.
+        """
+        try:
+            gitlab = self.connection_config.get_client()
+            if self.connection_config.access_config.get_secret_value().access_token is not None:
+                gitlab.auth()
+            else:
+                gitlab.projects.get(self.connection_config.repo_path)
+        except Exception as e:
+            logger.error(f"Failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        """Iterates over the GitLab repository tree and yields file metadata as `FileData` objects.
+
+        This method fetches the repository tree for the specified branch and iterates
+        over its contents. For each file (blob), it generates a `FileData` object containing
+        the file's metadata, path, and permissions.
+
+        Args:
+            **kwargs (Any): Additional keyword arguments (if required).
+
+        Yields:
+            FileData: A generator that yields `FileData` objects representing each file (blob)
+            in the repository.
+        """
+        project = self.connection_config.get_project()
+
+        ref = self.index_config.git_branch or project.default_branch
+
+        files = project.repository_tree(
+            path=str(self.index_config.path),
+            ref=ref,
+            recursive=self.index_config.recursive,
+            iterator=True,
+            all=True,
+        )
+
+        for file in files:
+            relative_path = str(Path(file["path"]).relative_to(self.index_config.path))
+            if file["type"] == "blob":
+                record_locator = {
+                    "file_path": file["path"],
+                    "ref": ref,
+                }
+
+                yield FileData(
+                    identifier=file["id"],
+                    connector_type=CONNECTOR_TYPE,
+                    source_identifiers=SourceIdentifiers(
+                        fullpath=file["path"],
+                        filename=Path(file["path"]).name,
+                        rel_path=relative_path,
+                    ),
+                    metadata=FileDataSourceMetadata(
+                        url=file["id"],
+                        record_locator=record_locator,
+                        permissions_data=[{"mode": file["mode"]}],
+                    ),
+                    additional_metadata={},
+                )
+
+
+class GitLabDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class GitLabDownloader(Downloader):
+    connection_config: GitLabConnectionConfig
+    download_config: GitLabDownloaderConfig
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        """Downloads a file from the repository and returns a `DownloadResponse`.
+
+        Args:
+            file_data (FileData): Metadata about the file to be downloaded.
+            **kwargs (Any): Additional arguments (if required).
+
+        Returns:
+            DownloadResponse: A response object containing the download details.
+        """
+        download_path = self.get_download_path(file_data=file_data)
+        if download_path is None:
+            logger.error(
+                "Generated download path is None, source_identifiers might be missing"
+                "from FileData."
+            )
+            raise ValueError("Generated invalid download path.")
+
+        self._download_file(file_data, download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+    def _download_file(self, file_data: FileData, download_path: Path) -> None:
+        # NOTE: Indexer should supply the record locator in metadata
+        if (
+            file_data.metadata.record_locator is None
+            or "ref" not in file_data.metadata.record_locator
+            or "file_path" not in file_data.metadata.record_locator
+        ):
+            logger.error(
+                f"Invalid record locator in metadata: {file_data.metadata.record_locator}."
+                "Keys 'ref' and 'path' must be present."
+            )
+            raise ValueError("Invalid record locator.")
+
+        ref = file_data.metadata.record_locator["ref"]
+        path = file_data.metadata.record_locator["file_path"]
+
+        project_file = self.connection_config.get_project().files.get(file_path=path, ref=ref)
+        download_path.parent.mkdir(exist_ok=True, parents=True)
+
+        with open(download_path, "wb") as file:
+            file.write(project_file.decode())
+
+
+gitlab_source_entry = SourceRegistryEntry(
+    connection_config=GitLabConnectionConfig,
+    indexer_config=GitLabIndexerConfig,
+    indexer=GitLabIndexer,
+    downloader_config=GitLabDownloaderConfig,
+    downloader=GitLabDownloader,
+)
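In short: the connection config normalizes any project URL into base_url plus repo_path, the indexer walks repository_tree() on the chosen ref, and the downloader re-fetches each blob by the file_path and ref stored in its record locator. A hedged wiring sketch, with the project URL as a placeholder and anonymous access to a public project assumed:

from unstructured_ingest.v2.processes.connectors.gitlab import (
    GitLabConnectionConfig,
    GitLabIndexer,
    GitLabIndexerConfig,
)

connection = GitLabConnectionConfig(url="https://gitlab.com/owner/repo")  # placeholder URL
indexer = GitLabIndexer(
    connection_config=connection,
    index_config=GitLabIndexerConfig(path="docs", recursive=True),  # placeholder subtree
)
indexer.precheck()  # no token configured, so this falls back to gitlab.projects.get(repo_path)
for file_data in indexer.run():
    print(file_data.metadata.record_locator)  # {'file_path': ..., 'ref': <default branch>}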
unstructured_ingest/v2/processes/connectors/kafka/__init__.py (new file)
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_source_entry,
+)
+
+from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
+from .cloud import kafka_cloud_source_entry
+from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
+from .local import kafka_local_source_entry
+
+add_source_entry(source_type=LOCAL_CONNECTOR, entry=kafka_local_source_entry)
+add_source_entry(source_type=CLOUD_CONNECTOR, entry=kafka_cloud_source_entry)
unstructured_ingest/v2/processes/connectors/kafka/cloud.py (new file)
@@ -0,0 +1,82 @@
+import socket
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+from pydantic import Field, Secret, SecretStr
+
+from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
+    KafkaAccessConfig,
+    KafkaConnectionConfig,
+    KafkaDownloader,
+    KafkaDownloaderConfig,
+    KafkaIndexer,
+    KafkaIndexerConfig,
+)
+
+if TYPE_CHECKING:
+    pass
+
+CONNECTOR_TYPE = "kafka-cloud"
+
+
+class CloudKafkaAccessConfig(KafkaAccessConfig):
+    api_key: Optional[SecretStr] = Field(
+        description="Kafka API key to connect at the server", alias="kafka_api_key", default=None
+    )
+    secret: Optional[SecretStr] = Field(description="", default=None)
+
+
+class CloudKafkaConnectionConfig(KafkaConnectionConfig):
+    access_config: Secret[CloudKafkaAccessConfig]
+
+    def get_consumer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+        access_config = self.access_config.get_secret_value()
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+            "client.id": socket.gethostname(),
+            "group.id": "default_group_id",
+            "enable.auto.commit": "false",
+            "auto.offset.reset": "earliest",
+            "message.max.bytes": 10485760,
+            "sasl.username": access_config.api_key,
+            "sasl.password": access_config.secret,
+            "sasl.mechanism": "PLAIN",
+            "security.protocol": "SASL_SSL",
+        }
+
+        return conf
+
+
+class CloudKafkaIndexerConfig(KafkaIndexerConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaIndexer(KafkaIndexer):
+    connection_config: CloudKafkaConnectionConfig
+    index_config: CloudKafkaIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class CloudKafkaDownloaderConfig(KafkaDownloaderConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaDownloader(KafkaDownloader):
+    connection_config: CloudKafkaConnectionConfig
+    download_config: CloudKafkaDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+kafka_cloud_source_entry = SourceRegistryEntry(
+    connection_config=CloudKafkaConnectionConfig,
+    indexer=CloudKafkaIndexer,
+    indexer_config=CloudKafkaIndexerConfig,
+    downloader=CloudKafkaDownloader,
+    downloader_config=CloudKafkaDownloaderConfig,
+)
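CloudKafkaConnectionConfig layers Confluent-style SASL_SSL settings on top of the base Kafka consumer configuration. A hedged construction sketch; bootstrap_server and port live on the parent KafkaConnectionConfig (imported above but not shown in this diff, so those field names are inferred from their use in get_consumer_configuration), and the broker and credentials are placeholders:

from unstructured_ingest.v2.processes.connectors.kafka.cloud import (
    CloudKafkaAccessConfig,
    CloudKafkaConnectionConfig,
)

config = CloudKafkaConnectionConfig(
    bootstrap_server="pkc-xxxxx.us-east-1.aws.confluent.cloud",  # placeholder broker
    port=9092,
    access_config=CloudKafkaAccessConfig(kafka_api_key="key", secret="secret"),  # placeholders; api_key is set via its alias
)
conf = config.get_consumer_configuration()
print(conf["bootstrap.servers"], conf["security.protocol"])  # host:9092 SASL_SSL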