unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/test_astradb.py +109 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_kafka.py +167 -0
- test/integration/connectors/test_onedrive.py +112 -0
- test/integration/connectors/test_pinecone.py +161 -0
- test/integration/connectors/test_qdrant.py +137 -0
- test/integration/connectors/test_s3.py +23 -0
- test/integration/connectors/utils/docker.py +2 -1
- test/integration/connectors/utils/validation.py +73 -22
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/kafka.py +0 -1
- unstructured_ingest/interfaces.py +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/chunker.py +2 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
- unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
- unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
- unstructured_ingest/v2/processes/partitioner.py +14 -3
- unstructured_ingest/v2/unstructured_api.py +24 -10
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/couchbase.py
@@ -205,6 +205,7 @@ class CouchbaseIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -148,9 +148,7 @@ class DatabricksVolumesDownloader(Downloader, ABC):
 
 
 class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass
 
 
 @dataclass
@@ -173,5 +171,5 @@ class DatabricksVolumesUploader(Uploader, ABC):
         self.connection_config.get_client().files.upload(
             file_path=output_path,
             contents=elements_file,
-            overwrite=self.upload_config.overwrite,
+            overwrite=True,
         )
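Note on the change above: the `overwrite` flag was dropped from `DatabricksVolumesUploaderConfig`, and uploads now always overwrite the destination file. A minimal sketch of the equivalent call made directly against the Databricks SDK (hypothetical volume path; credentials assumed to come from the environment):

```python
# Hedged sketch, not the connector's code: files.upload() with overwrite=True
# replaces any existing file at the destination path.
from databricks.sdk import WorkspaceClient

client = WorkspaceClient()  # picks up auth from env vars or ~/.databrickscfg
with open("elements.json", "rb") as f:
    client.files.upload(
        file_path="/Volumes/catalog/schema/volume/elements.json",  # hypothetical
        contents=f,
        overwrite=True,  # matches the hardcoded behavior introduced in 0.3.0
    )
```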
unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 from multiprocessing import Process
 from pathlib import Path
 from typing import Any, Optional
+from urllib.parse import urlparse
 
 import pandas as pd
 from pydantic import Field, Secret
@@ -94,7 +95,7 @@ class DeltaTableUploader(Uploader):
     connection_config: DeltaTableConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["
+    @requires_dependencies(["boto3"], extras="delta-table")
     def precheck(self):
         secrets = self.connection_config.access_config.get_secret_value()
         if (
@@ -102,13 +103,24 @@ class DeltaTableUploader(Uploader):
             and secrets.aws_access_key_id
             and secrets.aws_secret_access_key
         ):
-            from
+            from boto3 import client
+
+            url = urlparse(self.connection_config.table_uri)
+            bucket_name = url.netloc
+            dir_path = url.path.lstrip("/")
 
             try:
-
-
+                s3_client = client(
+                    "s3",
+                    aws_access_key_id=secrets.aws_access_key_id,
+                    aws_secret_access_key=secrets.aws_secret_access_key,
                 )
-
+                s3_client.put_object(Bucket=bucket_name, Key=dir_path, Body=b"")
+
+                response = s3_client.get_bucket_location(Bucket=bucket_name)
+
+                if self.connection_config.aws_region != response.get("LocationConstraint"):
+                    raise ValueError("Wrong AWS Region was provided.")
 
             except Exception as e:
                 logger.error(f"failed to validate connection: {e}", exc_info=True)
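The new precheck derives the S3 bucket and key prefix from the configured `table_uri` via `urlparse`, writes an empty object, then compares the bucket's `LocationConstraint` against `aws_region`. A quick sketch of the URI split (hypothetical URI):

```python
# Hedged sketch showing how urlparse splits an S3-style table URI into the
# bucket (netloc) and key prefix (path), as the precheck above does.
from urllib.parse import urlparse

url = urlparse("s3://my-bucket/path/to/table")  # hypothetical table_uri
print(url.netloc)            # -> "my-bucket"
print(url.path.lstrip("/"))  # -> "path/to/table"
```

One caveat worth noting: `get_bucket_location` returns a `LocationConstraint` of `None` for buckets in us-east-1, so the strict equality check above can misfire for that region.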
unstructured_ingest/v2/processes/connectors/elasticsearch.py
@@ -191,6 +191,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
+import os
 import random
+import shutil
+import tempfile
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -207,12 +210,35 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )
 
+    def handle_directory_download(self, lpath: Path) -> None:
+        # If the object's name contains certain characters (i.e. '?'), it
+        # gets downloaded into a new directory of the same name. This
+        # reconciles that with what is expected, which is to download it
+        # as a file that is not within a directory.
+        if not lpath.is_dir():
+            return
+        desired_name = lpath.name
+        files_in_dir = [file for file in lpath.iterdir() if file.is_file()]
+        if not files_in_dir:
+            raise ValueError(f"no files in {lpath}")
+        if len(files_in_dir) > 1:
+            raise ValueError(
+                "Multiple files in {}: {}".format(lpath, ", ".join([str(f) for f in files_in_dir]))
+            )
+        file = files_in_dir[0]
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_location = os.path.join(temp_dir, desired_name)
+            shutil.copyfile(src=file, dst=temp_location)
+            shutil.rmtree(lpath)
+            shutil.move(src=temp_location, dst=lpath)
+
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
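The `handle_directory_download` helper added above works around object keys containing characters like `?`, which fsspec can materialize as a directory wrapping the real file. A standalone sketch of the same flattening logic against the local filesystem (hypothetical file names):

```python
# Hedged sketch reproducing the reconciliation: a directory that should have
# been a file is collapsed down to the single file it contains.
import shutil
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as root:
    lpath = Path(root) / "report.csv"            # expected download target
    lpath.mkdir()                                # simulate the misbehavior:
    (lpath / "report.csv").write_text("a,b\n")   # real file nested inside
    inner = next(p for p in lpath.iterdir() if p.is_file())
    tmp = Path(root) / "tmp-report.csv"
    shutil.copyfile(inner, tmp)
    shutil.rmtree(lpath)                         # drop the wrapper directory
    shutil.move(str(tmp), str(lpath))            # lpath is now the file itself
    assert lpath.is_file()
```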
@@ -224,6 +250,7 @@ class FsspecDownloader(Downloader):
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -231,9 +258,7 @@ class FsspecDownloader(Downloader):
 
 
 class FsspecUploaderConfig(FileConfig, UploaderConfig):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass
 
 
 FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
@@ -288,9 +313,6 @@ class FsspecUploader(Uploader):
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-        if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
 
@@ -298,9 +320,5 @@ class FsspecUploader(Uploader):
         upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
         # Odd that fsspec doesn't run exists() as async even when client support async
-        already_exists = self.fs.exists(path=str(upload_path))
-        if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
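As with the Databricks connector, `FsspecUploaderConfig` loses its `overwrite` flag and the exists-check short-circuit is removed from both `run` and its async variant, so uploads now unconditionally overwrite. A minimal sketch of the resulting semantics (hypothetical paths, local filesystem for illustration):

```python
# Hedged sketch: fs.upload() with no prior exists() check simply replaces the
# destination; repeated runs of the same pipeline overwrite earlier output.
import fsspec

fs = fsspec.filesystem("file")
fs.upload(lpath="results/elements.json", rpath="/tmp/out/elements.json")
fs.upload(lpath="results/elements.json", rpath="/tmp/out/elements.json")  # overwrites
```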
unstructured_ingest/v2/processes/connectors/gitlab.py (new file)
@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from urllib.parse import urlparse
+
+from pydantic import Field, Secret, model_validator
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+CONNECTOR_TYPE = "gitlab"
+if TYPE_CHECKING:
+    from gitlab import Gitlab
+    from gitlab.v4.objects.projects import Project
+
+
+class GitLabAccessConfig(AccessConfig):
+    access_token: Optional[str] = Field(
+        default=None,
+        description="Optional personal access token for authenticating with the GitLab API.",
+    )
+
+
+class GitLabConnectionConfig(ConnectionConfig):
+    access_config: Secret[GitLabAccessConfig] = Field(
+        default_factory=GitLabAccessConfig,
+        validate_default=True,
+        description="Secret configuration for accessing the GitLab API by authentication token.",
+    )
+    url: str = Field(description="The full URL to the GitLab project or repository.")
+    base_url: str = Field(
+        default="https://gitlab.com",
+        description="The base URL for the GitLab instance (default is GitLab's public domain).",
+    )
+    repo_path: str = Field(
+        default=None,
+        init=False,
+        repr=False,
+        description="The normalized path extracted from the repository URL.",
+    )
+
+    @model_validator(mode="after")
+    def set_repo_path(self):
+        """
+        Parses the provided GitLab URL to extract the `base_url` and `repo_path`,
+        ensuring both are properly formatted for use.
+
+        If the URL contains a scheme (e.g., 'https') and a network location (e.g., 'gitlab.com'),
+        the `base_url` is set accordingly. The repository path is extracted and normalized
+        by removing any leading slashes.
+
+        Notes:
+            - If the URL contains both a scheme and network location, the `base_url` is
+              extracted directly from the URL.
+            - The `repo_path` is adjusted to remove any leading slashes.
+            - This method assumes that the URL follows GitLab's structure
+              (e.g., 'https://gitlab.com/owner/repo').
+        """
+        parsed_gh_url = urlparse(self.url)
+
+        if parsed_gh_url.scheme and parsed_gh_url.netloc:
+            self.base_url = f"{parsed_gh_url.scheme}://{parsed_gh_url.netloc}"
+        self.repo_path = parsed_gh_url.path.lstrip("/")
+
+        return self
+
+    @SourceConnectionError.wrap
+    @requires_dependencies(["gitlab"], extras="gitlab")
+    def get_client(self) -> "Gitlab":
+        from gitlab import Gitlab
+
+        logger.info(f"Connection to GitLab: {self.base_url!r}")
+        gitlab = Gitlab(
+            self.base_url, private_token=self.access_config.get_secret_value().access_token
+        )
+        return gitlab
+
+    def get_project(self) -> "Project":
+        """Retrieves the specified GitLab project using the configured base URL and access token.
+
+        Returns:
+            Project: A GitLab `Project` object representing the specified repository.
+
+        Raises:
+            SourceConnectionError: If the GitLab API connection fails.
+            gitlab.exceptions.GitlabGetError: If the project is not found.
+        """
+        gitlab = self.get_client()
+
+        logger.info(f"Accessing Project: '{self.repo_path}'")
+        project = gitlab.projects.get(self.repo_path)
+
+        logger.info(f"Successfully accessed project '{self.repo_path}'")
+        return project
+
+
+class GitLabIndexerConfig(IndexerConfig):
+    path: Path = Field(
+        default="/", description=("Path to the location in the repository that will be processed.")
+    )
+    recursive: bool = Field(
+        default=True,
+        description=(
+            "Flag to control recursive operations when indexing. "
+            "If True, the indexer will traverse directories recursively."
+        ),
+    )
+    git_branch: Optional[str] = Field(
+        default=None,
+        description="The name of the branch to interact with.",
+    )
+
+
+@dataclass
+class GitLabIndexer(Indexer):
+    connection_config: GitLabConnectionConfig
+    index_config: GitLabIndexerConfig
+
+    def precheck(self) -> None:
+        """Validates the connection to the GitLab instance by authenticating or
+        accessing the project.
+
+        This method ensures that the GitLab credentials and configuration are correct by
+        either authenticating or attempting to fetch the specified project.
+
+        Raises:
+            SourceConnectionError: If the connection or authentication with GitLab fails.
+        """
+
+        try:
+            gitlab = self.connection_config.get_client()
+            if self.connection_config.access_config.get_secret_value().access_token is not None:
+                gitlab.auth()
+            else:
+                gitlab.projects.get(self.connection_config.repo_path)
+
+        except Exception as e:
+            logger.error(f"Failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        """Iterates over the GitLab repository tree and yields file metadata as `FileData` objects.
+
+        This method fetches the repository tree for the specified branch and iterates
+        over its contents. For each file (blob), it generates a `FileData` object containing
+        the file's metadata, path, and permissions.
+
+        Args:
+            **kwargs (Any): Additional keyword arguments (if required).
+
+        Yields:
+            FileData: A generator that yields `FileData` objects representing each file (blob)
+                in the repository.
+        """
+        project = self.connection_config.get_project()
+
+        ref = self.index_config.git_branch or project.default_branch
+
+        files = project.repository_tree(
+            path=str(self.index_config.path),
+            ref=ref,
+            recursive=self.index_config.recursive,
+            iterator=True,
+            all=True,
+        )
+
+        for file in files:
+            relative_path = str(Path(file["path"]).relative_to(self.index_config.path))
+            if file["type"] == "blob":
+                record_locator = {
+                    "file_path": file["path"],
+                    "ref": ref,
+                }
+
+                yield FileData(
+                    identifier=file["id"],
+                    connector_type=CONNECTOR_TYPE,
+                    source_identifiers=SourceIdentifiers(
+                        fullpath=file["path"],
+                        filename=Path(file["path"]).name,
+                        rel_path=relative_path,
+                    ),
+                    metadata=FileDataSourceMetadata(
+                        url=file["id"],
+                        record_locator=record_locator,
+                        permissions_data=[{"mode": file["mode"]}],
+                    ),
+                    additional_metadata={},
+                )
+
+
+class GitLabDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class GitLabDownloader(Downloader):
+    connection_config: GitLabConnectionConfig
+    download_config: GitLabDownloaderConfig
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        """Downloads a file from the repository and returns a `DownloadResponse`.
+
+        Args:
+            file_data (FileData): Metadata about the file to be downloaded.
+            **kwargs (Any): Additional arguments (if required).
+
+        Returns:
+            DownloadResponse: A response object containing the download details.
+        """
+        download_path = self.get_download_path(file_data=file_data)
+        if download_path is None:
+            logger.error(
+                "Generated download path is None, source_identifiers might be missing"
+                "from FileData."
+            )
+            raise ValueError("Generated invalid download path.")
+
+        self._download_file(file_data, download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+    def _download_file(self, file_data: FileData, download_path: Path) -> None:
+        # NOTE: Indexer should supply the record locator in metadata
+        if (
+            file_data.metadata.record_locator is None
+            or "ref" not in file_data.metadata.record_locator
+            or "file_path" not in file_data.metadata.record_locator
+        ):
+            logger.error(
+                f"Invalid record locator in metadata: {file_data.metadata.record_locator}."
+                "Keys 'ref' and 'path' must be present."
+            )
+            raise ValueError("Invalid record locator.")
+
+        ref = file_data.metadata.record_locator["ref"]
+        path = file_data.metadata.record_locator["file_path"]
+
+        project_file = self.connection_config.get_project().files.get(file_path=path, ref=ref)
+        download_path.parent.mkdir(exist_ok=True, parents=True)
+
+        with open(download_path, "wb") as file:
+            file.write(project_file.decode())
+
+
+gitlab_source_entry = SourceRegistryEntry(
+    connection_config=GitLabConnectionConfig,
+    indexer_config=GitLabIndexerConfig,
+    indexer=GitLabIndexer,
+    downloader_config=GitLabDownloaderConfig,
+    downloader=GitLabDownloader,
+)
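A minimal usage sketch for the new source connector, using only the classes defined above (hypothetical repository URL; requires the `gitlab` extra and network access):

```python
# Hedged sketch: list the files the GitLab indexer would hand to downstream steps.
from unstructured_ingest.v2.processes.connectors.gitlab import (
    GitLabConnectionConfig,
    GitLabIndexer,
    GitLabIndexerConfig,
)

connection = GitLabConnectionConfig(url="https://gitlab.com/owner/repo")  # hypothetical
indexer = GitLabIndexer(
    connection_config=connection,
    index_config=GitLabIndexerConfig(recursive=True),
)
indexer.precheck()  # raises SourceConnectionError on bad credentials/URL
for file_data in indexer.run():
    print(file_data.source_identifiers.fullpath)
```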
unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -19,12 +19,12 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
@@ -294,7 +294,7 @@ class GoogleDriveDownloader(Downloader):
         _, downloaded = downloader.next_chunk()
         return downloaded
 
-    def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
+    def _write_file(self, file_data: FileData, file_contents: io.BytesIO) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
@@ -303,7 +303,7 @@ class GoogleDriveDownloader(Downloader):
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
     @requires_dependencies(["googleapiclient"], extras="google-drive")
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         from googleapiclient.http import MediaIoBaseDownload
 
         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
unstructured_ingest/v2/processes/connectors/kafka/__init__.py (new file)
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
+from .cloud import kafka_cloud_destination_entry, kafka_cloud_source_entry
+from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
+from .local import kafka_local_destination_entry, kafka_local_source_entry
+
+add_source_entry(source_type=LOCAL_CONNECTOR, entry=kafka_local_source_entry)
+add_destination_entry(destination_type=LOCAL_CONNECTOR, entry=kafka_local_destination_entry)
+
+add_source_entry(source_type=CLOUD_CONNECTOR, entry=kafka_cloud_source_entry)
+add_destination_entry(destination_type=CLOUD_CONNECTOR, entry=kafka_cloud_destination_entry)
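Registration is a side effect of importing the subpackage: the two `add_source_entry`/`add_destination_entry` pairs run at import time, making both Kafka variants visible to the connector registry. A one-line sketch of this pattern (the local connector's type string lives in `local.py`, which this diff lists but does not show; the cloud one is `"kafka-cloud"` per `cloud.py` below):

```python
# Hedged sketch: the import alone registers the kafka source and destination
# entries; nothing from the module needs to be referenced directly.
import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
```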
unstructured_ingest/v2/processes/connectors/kafka/cloud.py (new file)
@@ -0,0 +1,118 @@
+import socket
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+from pydantic import Field, Secret, SecretStr
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
+    KafkaAccessConfig,
+    KafkaConnectionConfig,
+    KafkaDownloader,
+    KafkaDownloaderConfig,
+    KafkaIndexer,
+    KafkaIndexerConfig,
+    KafkaUploader,
+    KafkaUploaderConfig,
+)
+
+if TYPE_CHECKING:
+    pass
+
+CONNECTOR_TYPE = "kafka-cloud"
+
+
+class CloudKafkaAccessConfig(KafkaAccessConfig):
+    api_key: Optional[SecretStr] = Field(
+        description="Kafka API key to connect at the server", alias="kafka_api_key", default=None
+    )
+    secret: Optional[SecretStr] = Field(description="", default=None)
+
+
+class CloudKafkaConnectionConfig(KafkaConnectionConfig):
+    access_config: Secret[CloudKafkaAccessConfig]
+
+    def get_consumer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+        access_config = self.access_config.get_secret_value()
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+            "client.id": socket.gethostname(),
+            "group.id": "default_group_id",
+            "enable.auto.commit": "false",
+            "auto.offset.reset": "earliest",
+            "sasl.username": access_config.api_key,
+            "sasl.password": access_config.secret,
+            "sasl.mechanism": "PLAIN",
+            "security.protocol": "SASL_SSL",
+        }
+
+        return conf
+
+    def get_producer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+        access_config = self.access_config.get_secret_value()
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+            "sasl.username": access_config.api_key,
+            "sasl.password": access_config.secret,
+            "sasl.mechanism": "PLAIN",
+            "security.protocol": "SASL_SSL",
+        }
+
+        return conf
+
+
+class CloudKafkaIndexerConfig(KafkaIndexerConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaIndexer(KafkaIndexer):
+    connection_config: CloudKafkaConnectionConfig
+    index_config: CloudKafkaIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class CloudKafkaDownloaderConfig(KafkaDownloaderConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaDownloader(KafkaDownloader):
+    connection_config: CloudKafkaConnectionConfig
+    download_config: CloudKafkaDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class CloudKafkaUploaderConfig(KafkaUploaderConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaUploader(KafkaUploader):
+    connection_config: CloudKafkaConnectionConfig
+    upload_config: CloudKafkaUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+kafka_cloud_source_entry = SourceRegistryEntry(
+    connection_config=CloudKafkaConnectionConfig,
+    indexer=CloudKafkaIndexer,
+    indexer_config=CloudKafkaIndexerConfig,
+    downloader=CloudKafkaDownloader,
+    downloader_config=CloudKafkaDownloaderConfig,
+)
+
+kafka_cloud_destination_entry = DestinationRegistryEntry(
+    connection_config=CloudKafkaConnectionConfig,
+    uploader=CloudKafkaUploader,
+    uploader_config=CloudKafkaUploaderConfig,
+)