unstructured-ingest 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_kafka.py +67 -0
- test/integration/connectors/test_onedrive.py +112 -0
- test/integration/connectors/test_qdrant.py +137 -0
- test/integration/connectors/utils/docker.py +2 -1
- test/integration/connectors/utils/validation.py +73 -22
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/connector/kafka.py +0 -1
- unstructured_ingest/interfaces.py +7 -7
- unstructured_ingest/v2/processes/chunker.py +2 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +12 -1
- unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +1 -10
- unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +82 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +196 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +75 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +163 -2
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
- unstructured_ingest/v2/processes/partitioner.py +14 -3
- unstructured_ingest/v2/unstructured_api.py +24 -10
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/METADATA +22 -22
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/RECORD +35 -20
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.2.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -2,6 +2,8 @@ from __future__ import annotations

 import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
@@ -16,12 +18,16 @@ from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONN
 from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
+from .confluence import confluence_source_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
+from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
 from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
@@ -33,7 +39,7 @@ from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
-from .onedrive import onedrive_source_entry
+from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -72,6 +78,7 @@ add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
 add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)

 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
+add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)

 add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
 add_destination_entry(
@@ -99,4 +106,8 @@ add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entr

 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)

+add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
+
 add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
+
+add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
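The net effect of these registry edits is that importing unstructured_ingest.v2.processes.connectors now registers the new Confluence, GitLab, Kafka, and Qdrant entries alongside the existing ones, and OneDrive gains a destination entry. A minimal sketch of checking the new source keys, under the assumption that connector_registry also exposes the source_registry mapping that add_source_entry writes into:

    # Assumption: source_registry is the mapping populated by add_source_entry.
    import unstructured_ingest.v2.processes.connectors  # noqa: F401  # triggers registration
    from unstructured_ingest.v2.processes.connector_registry import source_registry

    for key in ("confluence", "gitlab", "kafka-cloud"):
        print(key, key in source_registry)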
unstructured_ingest/v2/processes/connectors/confluence.py
@@ -0,0 +1,195 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Generator, List, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    download_responses,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from atlassian import Confluence
+
+CONNECTOR_TYPE = "confluence"
+
+
+class ConfluenceAccessConfig(AccessConfig):
+    api_token: str = Field(description="Confluence API token")
+
+
+class ConfluenceConnectionConfig(ConnectionConfig):
+    url: str = Field(description="URL of the Confluence instance")
+    user_email: str = Field(description="User email for authentication")
+    access_config: Secret[ConfluenceAccessConfig] = Field(
+        description="Access configuration for Confluence"
+    )
+
+    @requires_dependencies(["atlassian"], extras="confluence")
+    def get_client(self) -> "Confluence":
+        from atlassian import Confluence
+
+        access_configs = self.access_config.get_secret_value()
+        return Confluence(
+            url=self.url,
+            username=self.user_email,
+            password=access_configs.api_token,
+        )
+
+
+class ConfluenceIndexerConfig(IndexerConfig):
+    max_num_of_spaces: int = Field(500, description="Maximum number of spaces to index")
+    max_num_of_docs_from_each_space: int = Field(
+        100, description="Maximum number of documents to fetch from each space"
+    )
+    spaces: Optional[List[str]] = Field(None, description="List of specific space keys to index")
+
+
+@dataclass
+class ConfluenceIndexer(Indexer):
+    connection_config: ConfluenceConnectionConfig
+    index_config: ConfluenceIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> bool:
+        try:
+
+            # Attempt to retrieve a list of spaces with limit=1.
+            # This should only succeed if all creds are valid
+            client = self.connection_config.get_client()
+            client.get_all_spaces(limit=1)
+            logger.info("Connection to Confluence successful.")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
+
+    def _get_space_ids(self) -> List[str]:
+        spaces = self.index_config.spaces
+        if spaces:
+            return spaces
+        else:
+            client = self.connection_config.get_client()
+            all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
+            space_ids = [space["key"] for space in all_spaces["results"]]
+            return space_ids
+
+    def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
+        client = self.connection_config.get_client()
+        pages = client.get_all_pages_from_space(
+            space=space_id,
+            start=0,
+            limit=self.index_config.max_num_of_docs_from_each_space,
+            expand=None,
+            content_type="page",
+            status=None,
+        )
+        doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
+        return doc_ids
+
+    def run(self) -> Generator[FileData, None, None]:
+        from time import time
+
+        space_ids = self._get_space_ids()
+        for space_id in space_ids:
+            doc_ids = self._get_docs_ids_within_one_space(space_id)
+            for doc in doc_ids:
+                doc_id = doc["doc_id"]
+                # Build metadata
+                metadata = FileDataSourceMetadata(
+                    date_processed=str(time()),
+                    url=f"{self.connection_config.url}/pages/{doc_id}",
+                    record_locator={
+                        "space_id": space_id,
+                        "document_id": doc_id,
+                    },
+                )
+                additional_metadata = {
+                    "space_id": space_id,
+                    "document_id": doc_id,
+                }
+
+                # Construct relative path and filename
+                filename = f"{doc_id}.html"
+                relative_path = str(Path(space_id) / filename)
+
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    fullpath=relative_path,
+                    rel_path=relative_path,
+                )
+
+                file_data = FileData(
+                    identifier=doc_id,
+                    connector_type=self.connector_type,
+                    metadata=metadata,
+                    additional_metadata=additional_metadata,
+                    source_identifiers=source_identifiers,
+                )
+                yield file_data
+
+
+class ConfluenceDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class ConfluenceDownloader(Downloader):
+    connection_config: ConfluenceConnectionConfig
+    download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def run(self, file_data: FileData, **kwargs) -> download_responses:
+        doc_id = file_data.identifier
+        try:
+            client = self.connection_config.get_client()
+            page = client.get_page_by_id(
+                page_id=doc_id,
+                expand="history.lastUpdated,version,body.view",
+            )
+        except Exception as e:
+            logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
+
+        if not page:
+            raise ValueError(f"Page with ID {doc_id} does not exist.")
+
+        content = page["body"]["view"]["value"]
+
+        filepath = file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(content)
+
+        # Update file_data with metadata
+        file_data.metadata.date_created = page["history"]["createdDate"]
+        file_data.metadata.date_modified = page["version"]["when"]
+        file_data.metadata.version = str(page["version"]["number"])
+        file_data.display_name = page["title"]
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+confluence_source_entry = SourceRegistryEntry(
+    connection_config=ConfluenceConnectionConfig,
+    indexer_config=ConfluenceIndexerConfig,
+    indexer=ConfluenceIndexer,
+    downloader_config=ConfluenceDownloaderConfig,
+    downloader=ConfluenceDownloader,
+)
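The new module follows the v2 connector shape: a Pydantic connection config that builds an atlassian-python-api client, an indexer that walks spaces and pages, and a downloader that writes each page's rendered HTML body to disk. A hypothetical usage sketch assembled only from names in this diff (instance URL, email, token, and space key are placeholders; real runs would normally drive this through the ingest pipeline):

    from unstructured_ingest.v2.processes.connectors.confluence import (
        ConfluenceAccessConfig,
        ConfluenceConnectionConfig,
        ConfluenceIndexer,
        ConfluenceIndexerConfig,
    )

    connection_config = ConfluenceConnectionConfig(
        url="https://example.atlassian.net/wiki",  # placeholder instance
        user_email="user@example.com",             # placeholder account
        access_config=ConfluenceAccessConfig(api_token="REDACTED"),
    )
    indexer = ConfluenceIndexer(
        connection_config=connection_config,
        index_config=ConfluenceIndexerConfig(spaces=["DOCS"]),  # placeholder space key
    )
    indexer.precheck()  # raises SourceConnectionError on bad credentials
    for file_data in indexer.run():
        # one FileData per page; ConfluenceDownloader would fetch each body
        print(file_data.identifier, file_data.source_identifiers.fullpath)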
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -148,9 +148,7 @@ class DatabricksVolumesDownloader(Downloader, ABC):


 class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass


 @dataclass
@@ -173,5 +171,5 @@ class DatabricksVolumesUploader(Uploader, ABC):
             self.connection_config.get_client().files.upload(
                 file_path=output_path,
                 contents=elements_file,
-                overwrite=self.upload_config.overwrite,
+                overwrite=True,
             )
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -231,9 +231,7 @@ class FsspecDownloader(Downloader):


 class FsspecUploaderConfig(FileConfig, UploaderConfig):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass


 FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
@@ -288,9 +286,6 @@ class FsspecUploader(Uploader):
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-        if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

@@ -298,9 +293,5 @@ class FsspecUploader(Uploader):
         upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
         # Odd that fsspec doesn't run exists() as async even when client support async
-        already_exists = self.fs.exists(path=str(upload_path))
-        if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
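Together with the Databricks Volumes hunk above, this removes the overwrite option: the fsspec uploader no longer probes fs.exists() before writing, and the Databricks uploader now passes overwrite=True unconditionally, so a re-run always replaces files at the destination instead of skipping them. A small behavior sketch with fsspec's local filesystem (the path is hypothetical):

    import fsspec

    fs = fsspec.filesystem("file")
    fs.makedirs("/tmp/ingest-demo", exist_ok=True)
    fs.pipe("/tmp/ingest-demo/out.json", b"first run")
    fs.pipe("/tmp/ingest-demo/out.json", b"second run")  # overwrites, no skip
    print(fs.cat("/tmp/ingest-demo/out.json"))  # b'second run'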
unstructured_ingest/v2/processes/connectors/gitlab.py
@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from urllib.parse import urlparse
+
+from pydantic import Field, Secret, model_validator
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+CONNECTOR_TYPE = "gitlab"
+if TYPE_CHECKING:
+    from gitlab import Gitlab
+    from gitlab.v4.objects.projects import Project
+
+
+class GitLabAccessConfig(AccessConfig):
+    access_token: Optional[str] = Field(
+        default=None,
+        description="Optional personal access token for authenticating with the GitLab API.",
+    )
+
+
+class GitLabConnectionConfig(ConnectionConfig):
+    access_config: Secret[GitLabAccessConfig] = Field(
+        default_factory=GitLabAccessConfig,
+        validate_default=True,
+        description="Secret configuration for accessing the GitLab API by authentication token.",
+    )
+    url: str = Field(description="The full URL to the GitLab project or repository.")
+    base_url: str = Field(
+        default="https://gitlab.com",
+        description="The base URL for the GitLab instance (default is GitLab's public domain).",
+    )
+    repo_path: str = Field(
+        default=None,
+        init=False,
+        repr=False,
+        description="The normalized path extracted from the repository URL.",
+    )
+
+    @model_validator(mode="after")
+    def set_repo_path(self):
+        """
+        Parses the provided GitLab URL to extract the `base_url` and `repo_path`,
+        ensuring both are properly formatted for use.
+
+        If the URL contains a scheme (e.g., 'https') and a network location (e.g., 'gitlab.com'),
+        the `base_url` is set accordingly. The repository path is extracted and normalized
+        by removing any leading slashes.
+
+        Notes:
+            - If the URL contains both a scheme and network location, the `base_url` is
+              extracted directly from the URL.
+            - The `repo_path` is adjusted to remove any leading slashes.
+            - This method assumes that the URL follows GitLab's structure
+              (e.g., 'https://gitlab.com/owner/repo').
+        """
+        parsed_gh_url = urlparse(self.url)
+
+        if parsed_gh_url.scheme and parsed_gh_url.netloc:
+            self.base_url = f"{parsed_gh_url.scheme}://{parsed_gh_url.netloc}"
+        self.repo_path = parsed_gh_url.path.lstrip("/")
+
+        return self
+
+    @SourceConnectionError.wrap
+    @requires_dependencies(["gitlab"], extras="gitlab")
+    def get_client(self) -> "Gitlab":
+        from gitlab import Gitlab
+
+        logger.info(f"Connection to GitLab: {self.base_url!r}")
+        gitlab = Gitlab(
+            self.base_url, private_token=self.access_config.get_secret_value().access_token
+        )
+        return gitlab
+
+    def get_project(self) -> "Project":
+        """Retrieves the specified GitLab project using the configured base URL and access token.
+
+        Returns:
+            Project: A GitLab `Project` object representing the specified repository.
+
+        Raises:
+            SourceConnectionError: If the GitLab API connection fails.
+            gitlab.exceptions.GitlabGetError: If the project is not found.
+        """
+        gitlab = self.get_client()
+
+        logger.info(f"Accessing Project: '{self.repo_path}'")
+        project = gitlab.projects.get(self.repo_path)
+
+        logger.info(f"Successfully accessed project '{self.repo_path}'")
+        return project
+
+
+class GitLabIndexerConfig(IndexerConfig):
+    path: Path = Field(
+        default="/", description=("Path to the location in the repository that will be processed.")
+    )
+    recursive: bool = Field(
+        default=True,
+        description=(
+            "Flag to control recursive operations when indexing. "
+            "If True, the indexer will traverse directories recursively."
+        ),
+    )
+    git_branch: Optional[str] = Field(
+        default=None,
+        description="The name of the branch to interact with.",
+    )
+
+
+@dataclass
+class GitLabIndexer(Indexer):
+    connection_config: GitLabConnectionConfig
+    index_config: GitLabIndexerConfig
+
+    def precheck(self) -> None:
+        """Validates the connection to the GitLab instance by authenticating or
+        accessing the project.
+
+        This method ensures that the GitLab credentials and configuration are correct by
+        either authenticating or attempting to fetch the specified project.
+
+        Raises:
+            SourceConnectionError: If the connection or authentication with GitLab fails.
+        """
+
+        try:
+            gitlab = self.connection_config.get_client()
+            if self.connection_config.access_config.get_secret_value().access_token is not None:
+                gitlab.auth()
+            else:
+                gitlab.projects.get(self.connection_config.repo_path)
+
+        except Exception as e:
+            logger.error(f"Failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        """Iterates over the GitLab repository tree and yields file metadata as `FileData` objects.
+
+        This method fetches the repository tree for the specified branch and iterates
+        over its contents. For each file (blob), it generates a `FileData` object containing
+        the file's metadata, path, and permissions.
+
+        Args:
+            **kwargs (Any): Additional keyword arguments (if required).
+
+        Yields:
+            FileData: A generator that yields `FileData` objects representing each file (blob)
+            in the repository.
+        """
+        project = self.connection_config.get_project()
+
+        ref = self.index_config.git_branch or project.default_branch
+
+        files = project.repository_tree(
+            path=str(self.index_config.path),
+            ref=ref,
+            recursive=self.index_config.recursive,
+            iterator=True,
+            all=True,
+        )
+
+        for file in files:
+            relative_path = str(Path(file["path"]).relative_to(self.index_config.path))
+            if file["type"] == "blob":
+                record_locator = {
+                    "file_path": file["path"],
+                    "ref": ref,
+                }
+
+                yield FileData(
+                    identifier=file["id"],
+                    connector_type=CONNECTOR_TYPE,
+                    source_identifiers=SourceIdentifiers(
+                        fullpath=file["path"],
+                        filename=Path(file["path"]).name,
+                        rel_path=relative_path,
+                    ),
+                    metadata=FileDataSourceMetadata(
+                        url=file["id"],
+                        record_locator=record_locator,
+                        permissions_data=[{"mode": file["mode"]}],
+                    ),
+                    additional_metadata={},
+                )
+
+
+class GitLabDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class GitLabDownloader(Downloader):
+    connection_config: GitLabConnectionConfig
+    download_config: GitLabDownloaderConfig
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        """Downloads a file from the repository and returns a `DownloadResponse`.
+
+        Args:
+            file_data (FileData): Metadata about the file to be downloaded.
+            **kwargs (Any): Additional arguments (if required).
+
+        Returns:
+            DownloadResponse: A response object containing the download details.
+        """
+        download_path = self.get_download_path(file_data=file_data)
+        if download_path is None:
+            logger.error(
+                "Generated download path is None, source_identifiers might be missing"
+                "from FileData."
+            )
+            raise ValueError("Generated invalid download path.")
+
+        self._download_file(file_data, download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+    def _download_file(self, file_data: FileData, download_path: Path) -> None:
+        # NOTE: Indexer should supply the record locator in metadata
+        if (
+            file_data.metadata.record_locator is None
+            or "ref" not in file_data.metadata.record_locator
+            or "file_path" not in file_data.metadata.record_locator
+        ):
+            logger.error(
+                f"Invalid record locator in metadata: {file_data.metadata.record_locator}."
+                "Keys 'ref' and 'path' must be present."
+            )
+            raise ValueError("Invalid record locator.")
+
+        ref = file_data.metadata.record_locator["ref"]
+        path = file_data.metadata.record_locator["file_path"]
+
+        project_file = self.connection_config.get_project().files.get(file_path=path, ref=ref)
+        download_path.parent.mkdir(exist_ok=True, parents=True)
+
+        with open(download_path, "wb") as file:
+            file.write(project_file.decode())
+
+
+gitlab_source_entry = SourceRegistryEntry(
+    connection_config=GitLabConnectionConfig,
+    indexer_config=GitLabIndexerConfig,
+    indexer=GitLabIndexer,
+    downloader_config=GitLabDownloaderConfig,
+    downloader=GitLabDownloader,
+)
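The GitLab source mirrors the Confluence layout: the connection config normalizes the project URL into base_url plus repo_path, the indexer streams the repository tree for a ref, and the downloader fetches each blob by path and ref. A hypothetical sketch using only names from this diff (project URL, subdirectory, and branch are placeholders; leaving access_token unset gives an anonymous client, which suffices for public projects):

    from unstructured_ingest.v2.processes.connectors.gitlab import (
        GitLabConnectionConfig,
        GitLabIndexer,
        GitLabIndexerConfig,
    )

    connection_config = GitLabConnectionConfig(
        url="https://gitlab.com/example-group/example-repo",  # placeholder project
    )
    indexer = GitLabIndexer(
        connection_config=connection_config,
        index_config=GitLabIndexerConfig(path="docs", git_branch="main"),  # placeholders
    )
    indexer.precheck()  # anonymous client: validated by fetching the project
    for file_data in indexer.run():
        print(file_data.identifier, file_data.source_identifiers.rel_path)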
unstructured_ingest/v2/processes/connectors/kafka/__init__.py
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_source_entry,
+)
+
+from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
+from .cloud import kafka_cloud_source_entry
+from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
+from .local import kafka_local_source_entry
+
+add_source_entry(source_type=LOCAL_CONNECTOR, entry=kafka_local_source_entry)
+add_source_entry(source_type=CLOUD_CONNECTOR, entry=kafka_cloud_source_entry)
unstructured_ingest/v2/processes/connectors/kafka/cloud.py
@@ -0,0 +1,82 @@
+import socket
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+from pydantic import Field, Secret, SecretStr
+
+from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
+    KafkaAccessConfig,
+    KafkaConnectionConfig,
+    KafkaDownloader,
+    KafkaDownloaderConfig,
+    KafkaIndexer,
+    KafkaIndexerConfig,
+)
+
+if TYPE_CHECKING:
+    pass
+
+CONNECTOR_TYPE = "kafka-cloud"
+
+
+class CloudKafkaAccessConfig(KafkaAccessConfig):
+    api_key: Optional[SecretStr] = Field(
+        description="Kafka API key to connect at the server", alias="kafka_api_key", default=None
+    )
+    secret: Optional[SecretStr] = Field(description="", default=None)
+
+
+class CloudKafkaConnectionConfig(KafkaConnectionConfig):
+    access_config: Secret[CloudKafkaAccessConfig]
+
+    def get_consumer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+        access_config = self.access_config.get_secret_value()
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+            "client.id": socket.gethostname(),
+            "group.id": "default_group_id",
+            "enable.auto.commit": "false",
+            "auto.offset.reset": "earliest",
+            "message.max.bytes": 10485760,
+            "sasl.username": access_config.api_key,
+            "sasl.password": access_config.secret,
+            "sasl.mechanism": "PLAIN",
+            "security.protocol": "SASL_SSL",
+        }
+
+        return conf
+
+
+class CloudKafkaIndexerConfig(KafkaIndexerConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaIndexer(KafkaIndexer):
+    connection_config: CloudKafkaConnectionConfig
+    index_config: CloudKafkaIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class CloudKafkaDownloaderConfig(KafkaDownloaderConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaDownloader(KafkaDownloader):
+    connection_config: CloudKafkaConnectionConfig
+    download_config: CloudKafkaDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+kafka_cloud_source_entry = SourceRegistryEntry(
+    connection_config=CloudKafkaConnectionConfig,
+    indexer=CloudKafkaIndexer,
+    indexer_config=CloudKafkaIndexerConfig,
+    downloader=CloudKafkaDownloader,
+    downloader_config=CloudKafkaDownloaderConfig,
+)
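The cloud variant only layers SASL_SSL/PLAIN credentials onto the consumer settings; indexing and downloading are inherited from the shared kafka.kafka base classes. A sketch of building that consumer configuration (broker host, port, and credentials are placeholders; bootstrap_server and port are assumed to be fields on the shared KafkaConnectionConfig, and any other required fields on the base classes are omitted here):

    from unstructured_ingest.v2.processes.connectors.kafka.cloud import (
        CloudKafkaAccessConfig,
        CloudKafkaConnectionConfig,
    )

    connection_config = CloudKafkaConnectionConfig(
        bootstrap_server="pkc-xxxxx.us-west-2.aws.confluent.cloud",  # placeholder broker
        port=9092,
        access_config=CloudKafkaAccessConfig(
            kafka_api_key="API_KEY",  # populated via the field's alias
            secret="API_SECRET",
        ),
    )
    conf = connection_config.get_consumer_configuration()
    print(conf["security.protocol"], conf["sasl.mechanism"])  # SASL_SSL PLAIN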