unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (78):
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_confluence.py +113 -0
  4. test/integration/connectors/test_kafka.py +167 -0
  5. test/integration/connectors/test_onedrive.py +112 -0
  6. test/integration/connectors/test_pinecone.py +161 -0
  7. test/integration/connectors/test_qdrant.py +137 -0
  8. test/integration/connectors/test_s3.py +23 -0
  9. test/integration/connectors/utils/docker.py +2 -1
  10. test/integration/connectors/utils/validation.py +73 -22
  11. test/unit/v2/__init__.py +0 -0
  12. test/unit/v2/chunkers/__init__.py +0 -0
  13. test/unit/v2/chunkers/test_chunkers.py +49 -0
  14. test/unit/v2/connectors/__init__.py +0 -0
  15. test/unit/v2/embedders/__init__.py +0 -0
  16. test/unit/v2/embedders/test_bedrock.py +36 -0
  17. test/unit/v2/embedders/test_huggingface.py +48 -0
  18. test/unit/v2/embedders/test_mixedbread.py +37 -0
  19. test/unit/v2/embedders/test_octoai.py +35 -0
  20. test/unit/v2/embedders/test_openai.py +35 -0
  21. test/unit/v2/embedders/test_togetherai.py +37 -0
  22. test/unit/v2/embedders/test_vertexai.py +37 -0
  23. test/unit/v2/embedders/test_voyageai.py +38 -0
  24. test/unit/v2/partitioners/__init__.py +0 -0
  25. test/unit/v2/partitioners/test_partitioner.py +63 -0
  26. test/unit/v2/utils/__init__.py +0 -0
  27. test/unit/v2/utils/data_generator.py +32 -0
  28. unstructured_ingest/__version__.py +1 -1
  29. unstructured_ingest/cli/cmds/__init__.py +2 -2
  30. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  31. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  32. unstructured_ingest/connector/kafka.py +0 -1
  33. unstructured_ingest/interfaces.py +7 -7
  34. unstructured_ingest/runner/writers/__init__.py +2 -2
  35. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  36. unstructured_ingest/v2/constants.py +2 -0
  37. unstructured_ingest/v2/processes/chunker.py +2 -2
  38. unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
  39. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  40. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  41. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  42. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  43. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  44. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  45. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  46. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  47. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
  48. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  49. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  50. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  51. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
  52. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
  53. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  54. unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
  55. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  56. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  57. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  58. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  59. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  60. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  61. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  62. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  63. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  64. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  65. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  66. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  67. unstructured_ingest/v2/processes/partitioner.py +14 -3
  68. unstructured_ingest/v2/unstructured_api.py +24 -10
  69. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
  70. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
  71. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  72. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  73. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  74. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  75. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  76. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  77. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  78. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/couchbase.py
@@ -205,6 +205,7 @@ class CouchbaseIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -148,9 +148,7 @@ class DatabricksVolumesDownloader(Downloader, ABC):
 
 
 class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass
 
 
 @dataclass
@@ -173,5 +171,5 @@ class DatabricksVolumesUploader(Uploader, ABC):
         self.connection_config.get_client().files.upload(
             file_path=output_path,
             contents=elements_file,
-            overwrite=self.upload_config.overwrite,
+            overwrite=True,
         )
unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 from multiprocessing import Process
 from pathlib import Path
 from typing import Any, Optional
+from urllib.parse import urlparse
 
 import pandas as pd
 from pydantic import Field, Secret
@@ -94,7 +95,7 @@ class DeltaTableUploader(Uploader):
     connection_config: DeltaTableConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    @requires_dependencies(["boto3"], extras="delta-table")
     def precheck(self):
         secrets = self.connection_config.access_config.get_secret_value()
         if (
@@ -102,13 +103,24 @@ class DeltaTableUploader(Uploader):
             and secrets.aws_access_key_id
            and secrets.aws_secret_access_key
        ):
-            from fsspec import get_filesystem_class
+            from boto3 import client
+
+            url = urlparse(self.connection_config.table_uri)
+            bucket_name = url.netloc
+            dir_path = url.path.lstrip("/")
 
             try:
-                fs = get_filesystem_class("s3")(
-                    key=secrets.aws_access_key_id, secret=secrets.aws_secret_access_key
+                s3_client = client(
+                    "s3",
+                    aws_access_key_id=secrets.aws_access_key_id,
+                    aws_secret_access_key=secrets.aws_secret_access_key,
                 )
-                fs.write_bytes(path=self.connection_config.table_uri, value=b"")
+                s3_client.put_object(Bucket=bucket_name, Key=dir_path, Body=b"")
+
+                response = s3_client.get_bucket_location(Bucket=bucket_name)
+
+                if self.connection_config.aws_region != response.get("LocationConstraint"):
+                    raise ValueError("Wrong AWS Region was provided.")
 
             except Exception as e:
                 logger.error(f"failed to validate connection: {e}", exc_info=True)
unstructured_ingest/v2/processes/connectors/elasticsearch.py
@@ -191,6 +191,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
+import os
 import random
+import shutil
+import tempfile
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -207,12 +210,35 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )
 
+    def handle_directory_download(self, lpath: Path) -> None:
+        # If the object's name contains certain characters (i.e. '?'), it
+        # gets downloaded into a new directory of the same name. This
+        # reconciles that with what is expected, which is to download it
+        # as a file that is not within a directory.
+        if not lpath.is_dir():
+            return
+        desired_name = lpath.name
+        files_in_dir = [file for file in lpath.iterdir() if file.is_file()]
+        if not files_in_dir:
+            raise ValueError(f"no files in {lpath}")
+        if len(files_in_dir) > 1:
+            raise ValueError(
+                "Multiple files in {}: {}".format(lpath, ", ".join([str(f) for f in files_in_dir]))
+            )
+        file = files_in_dir[0]
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_location = os.path.join(temp_dir, desired_name)
+            shutil.copyfile(src=file, dst=temp_location)
+            shutil.rmtree(lpath)
+            shutil.move(src=temp_location, dst=lpath)
+
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -224,6 +250,7 @@ class FsspecDownloader(Downloader):
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -231,9 +258,7 @@ class FsspecDownloader(Downloader):
 
 
 class FsspecUploaderConfig(FileConfig, UploaderConfig):
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
+    pass
 
 
 FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
@@ -288,9 +313,6 @@ class FsspecUploader(Uploader):
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-        if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
 
@@ -298,9 +320,5 @@ class FsspecUploader(Uploader):
         upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
         # Odd that fsspec doesn't run exists() as async even when client support async
-        already_exists = self.fs.exists(path=str(upload_path))
-        if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
-            return
         logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
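
The new handle_directory_download hook exists because fsspec's get() can materialize an object whose name contains characters such as '?' as a single-file directory rather than as a file. A self-contained sketch of that collapse step, using only the standard library (the paths are illustrative):

# Illustration of collapsing a one-file directory back into a file at the same path.
import shutil
import tempfile
from pathlib import Path


def collapse_single_file_dir(lpath: Path) -> None:
    if not lpath.is_dir():
        return  # already a regular file, nothing to reconcile
    files = [f for f in lpath.iterdir() if f.is_file()]
    if len(files) != 1:
        raise ValueError(f"expected exactly one file in {lpath}, found {len(files)}")
    with tempfile.TemporaryDirectory() as tmp:
        staged = Path(tmp) / lpath.name
        shutil.copyfile(files[0], staged)     # stage the inner file outside the directory
        shutil.rmtree(lpath)                  # remove the unwanted directory
        shutil.move(str(staged), str(lpath))  # put the file where the directory was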
unstructured_ingest/v2/processes/connectors/gitlab.py
@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from urllib.parse import urlparse
+
+from pydantic import Field, Secret, model_validator
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+CONNECTOR_TYPE = "gitlab"
+if TYPE_CHECKING:
+    from gitlab import Gitlab
+    from gitlab.v4.objects.projects import Project
+
+
+class GitLabAccessConfig(AccessConfig):
+    access_token: Optional[str] = Field(
+        default=None,
+        description="Optional personal access token for authenticating with the GitLab API.",
+    )
+
+
+class GitLabConnectionConfig(ConnectionConfig):
+    access_config: Secret[GitLabAccessConfig] = Field(
+        default_factory=GitLabAccessConfig,
+        validate_default=True,
+        description="Secret configuration for accessing the GitLab API by authentication token.",
+    )
+    url: str = Field(description="The full URL to the GitLab project or repository.")
+    base_url: str = Field(
+        default="https://gitlab.com",
+        description="The base URL for the GitLab instance (default is GitLab's public domain).",
+    )
+    repo_path: str = Field(
+        default=None,
+        init=False,
+        repr=False,
+        description="The normalized path extracted from the repository URL.",
+    )
+
+    @model_validator(mode="after")
+    def set_repo_path(self):
+        """
+        Parses the provided GitLab URL to extract the `base_url` and `repo_path`,
+        ensuring both are properly formatted for use.
+
+        If the URL contains a scheme (e.g., 'https') and a network location (e.g., 'gitlab.com'),
+        the `base_url` is set accordingly. The repository path is extracted and normalized
+        by removing any leading slashes.
+
+        Notes:
+            - If the URL contains both a scheme and network location, the `base_url` is
+              extracted directly from the URL.
+            - The `repo_path` is adjusted to remove any leading slashes.
+            - This method assumes that the URL follows GitLab's structure
+              (e.g., 'https://gitlab.com/owner/repo').
+        """
+        parsed_gh_url = urlparse(self.url)
+
+        if parsed_gh_url.scheme and parsed_gh_url.netloc:
+            self.base_url = f"{parsed_gh_url.scheme}://{parsed_gh_url.netloc}"
+        self.repo_path = parsed_gh_url.path.lstrip("/")
+
+        return self
+
+    @SourceConnectionError.wrap
+    @requires_dependencies(["gitlab"], extras="gitlab")
+    def get_client(self) -> "Gitlab":
+        from gitlab import Gitlab
+
+        logger.info(f"Connection to GitLab: {self.base_url!r}")
+        gitlab = Gitlab(
+            self.base_url, private_token=self.access_config.get_secret_value().access_token
+        )
+        return gitlab
+
+    def get_project(self) -> "Project":
+        """Retrieves the specified GitLab project using the configured base URL and access token.
+
+        Returns:
+            Project: A GitLab `Project` object representing the specified repository.
+
+        Raises:
+            SourceConnectionError: If the GitLab API connection fails.
+            gitlab.exceptions.GitlabGetError: If the project is not found.
+        """
+        gitlab = self.get_client()
+
+        logger.info(f"Accessing Project: '{self.repo_path}'")
+        project = gitlab.projects.get(self.repo_path)
+
+        logger.info(f"Successfully accessed project '{self.repo_path}'")
+        return project
+
+
+class GitLabIndexerConfig(IndexerConfig):
+    path: Path = Field(
+        default="/", description=("Path to the location in the repository that will be processed.")
+    )
+    recursive: bool = Field(
+        default=True,
+        description=(
+            "Flag to control recursive operations when indexing. "
+            "If True, the indexer will traverse directories recursively."
+        ),
+    )
+    git_branch: Optional[str] = Field(
+        default=None,
+        description="The name of the branch to interact with.",
+    )
+
+
+@dataclass
+class GitLabIndexer(Indexer):
+    connection_config: GitLabConnectionConfig
+    index_config: GitLabIndexerConfig
+
+    def precheck(self) -> None:
+        """Validates the connection to the GitLab instance by authenticating or
+        accessing the project.
+
+        This method ensures that the GitLab credentials and configuration are correct by
+        either authenticating or attempting to fetch the specified project.
+
+        Raises:
+            SourceConnectionError: If the connection or authentication with GitLab fails.
+        """
+
+        try:
+            gitlab = self.connection_config.get_client()
+            if self.connection_config.access_config.get_secret_value().access_token is not None:
+                gitlab.auth()
+            else:
+                gitlab.projects.get(self.connection_config.repo_path)
+
+        except Exception as e:
+            logger.error(f"Failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        """Iterates over the GitLab repository tree and yields file metadata as `FileData` objects.
+
+        This method fetches the repository tree for the specified branch and iterates
+        over its contents. For each file (blob), it generates a `FileData` object containing
+        the file's metadata, path, and permissions.
+
+        Args:
+            **kwargs (Any): Additional keyword arguments (if required).
+
+        Yields:
+            FileData: A generator that yields `FileData` objects representing each file (blob)
+            in the repository.
+        """
+        project = self.connection_config.get_project()
+
+        ref = self.index_config.git_branch or project.default_branch
+
+        files = project.repository_tree(
+            path=str(self.index_config.path),
+            ref=ref,
+            recursive=self.index_config.recursive,
+            iterator=True,
+            all=True,
+        )
+
+        for file in files:
+            relative_path = str(Path(file["path"]).relative_to(self.index_config.path))
+            if file["type"] == "blob":
+                record_locator = {
+                    "file_path": file["path"],
+                    "ref": ref,
+                }
+
+                yield FileData(
+                    identifier=file["id"],
+                    connector_type=CONNECTOR_TYPE,
+                    source_identifiers=SourceIdentifiers(
+                        fullpath=file["path"],
+                        filename=Path(file["path"]).name,
+                        rel_path=relative_path,
+                    ),
+                    metadata=FileDataSourceMetadata(
+                        url=file["id"],
+                        record_locator=record_locator,
+                        permissions_data=[{"mode": file["mode"]}],
+                    ),
+                    additional_metadata={},
+                )
+
+
+class GitLabDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class GitLabDownloader(Downloader):
+    connection_config: GitLabConnectionConfig
+    download_config: GitLabDownloaderConfig
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        """Downloads a file from the repository and returns a `DownloadResponse`.
+
+        Args:
+            file_data (FileData): Metadata about the file to be downloaded.
+            **kwargs (Any): Additional arguments (if required).
+
+        Returns:
+            DownloadResponse: A response object containing the download details.
+        """
+        download_path = self.get_download_path(file_data=file_data)
+        if download_path is None:
+            logger.error(
+                "Generated download path is None, source_identifiers might be missing"
+                "from FileData."
+            )
+            raise ValueError("Generated invalid download path.")
+
+        self._download_file(file_data, download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+    def _download_file(self, file_data: FileData, download_path: Path) -> None:
+        # NOTE: Indexer should supply the record locator in metadata
+        if (
+            file_data.metadata.record_locator is None
+            or "ref" not in file_data.metadata.record_locator
+            or "file_path" not in file_data.metadata.record_locator
+        ):
+            logger.error(
+                f"Invalid record locator in metadata: {file_data.metadata.record_locator}."
+                "Keys 'ref' and 'path' must be present."
+            )
+            raise ValueError("Invalid record locator.")
+
+        ref = file_data.metadata.record_locator["ref"]
+        path = file_data.metadata.record_locator["file_path"]
+
+        project_file = self.connection_config.get_project().files.get(file_path=path, ref=ref)
+        download_path.parent.mkdir(exist_ok=True, parents=True)
+
+        with open(download_path, "wb") as file:
+            file.write(project_file.decode())
+
+
+gitlab_source_entry = SourceRegistryEntry(
+    connection_config=GitLabConnectionConfig,
+    indexer_config=GitLabIndexerConfig,
+    indexer=GitLabIndexer,
+    downloader_config=GitLabDownloaderConfig,
+    downloader=GitLabDownloader,
+)
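
For context, the new GitLab source can be driven directly from the classes added above. A minimal sketch, assuming a public project so no token is needed; the URL, path, and branch values are placeholders, not taken from the package:

# Hypothetical wiring of the GitLab source connector added in this release.
from unstructured_ingest.v2.processes.connectors.gitlab import (
    GitLabConnectionConfig,
    GitLabIndexer,
    GitLabIndexerConfig,
)

connection_config = GitLabConnectionConfig(
    url="https://gitlab.com/example-group/example-repo",  # placeholder project URL
)
indexer = GitLabIndexer(
    connection_config=connection_config,
    index_config=GitLabIndexerConfig(path="docs", recursive=True, git_branch="main"),
)

indexer.precheck()               # validates credentials / project access
for file_data in indexer.run():  # yields one FileData per blob in the repository tree
    print(file_data.source_identifiers.fullpath, file_data.identifier)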
unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -19,12 +19,12 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
@@ -294,7 +294,7 @@ class GoogleDriveDownloader(Downloader):
         _, downloaded = downloader.next_chunk()
         return downloaded
 
-    def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
+    def _write_file(self, file_data: FileData, file_contents: io.BytesIO) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
@@ -303,7 +303,7 @@ class GoogleDriveDownloader(Downloader):
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
     @requires_dependencies(["googleapiclient"], extras="google-drive")
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         from googleapiclient.http import MediaIoBaseDownload
 
         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
unstructured_ingest/v2/processes/connectors/kafka/__init__.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
+from .cloud import kafka_cloud_destination_entry, kafka_cloud_source_entry
+from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
+from .local import kafka_local_destination_entry, kafka_local_source_entry
+
+add_source_entry(source_type=LOCAL_CONNECTOR, entry=kafka_local_source_entry)
+add_destination_entry(destination_type=LOCAL_CONNECTOR, entry=kafka_local_destination_entry)
+
+add_source_entry(source_type=CLOUD_CONNECTOR, entry=kafka_cloud_source_entry)
+add_destination_entry(destination_type=CLOUD_CONNECTOR, entry=kafka_cloud_destination_entry)
unstructured_ingest/v2/processes/connectors/kafka/cloud.py
@@ -0,0 +1,118 @@
+import socket
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+from pydantic import Field, Secret, SecretStr
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
+    KafkaAccessConfig,
+    KafkaConnectionConfig,
+    KafkaDownloader,
+    KafkaDownloaderConfig,
+    KafkaIndexer,
+    KafkaIndexerConfig,
+    KafkaUploader,
+    KafkaUploaderConfig,
+)
+
+if TYPE_CHECKING:
+    pass
+
+CONNECTOR_TYPE = "kafka-cloud"
+
+
+class CloudKafkaAccessConfig(KafkaAccessConfig):
+    api_key: Optional[SecretStr] = Field(
+        description="Kafka API key to connect at the server", alias="kafka_api_key", default=None
+    )
+    secret: Optional[SecretStr] = Field(description="", default=None)
+
+
+class CloudKafkaConnectionConfig(KafkaConnectionConfig):
+    access_config: Secret[CloudKafkaAccessConfig]
+
+    def get_consumer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+        access_config = self.access_config.get_secret_value()
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+            "client.id": socket.gethostname(),
+            "group.id": "default_group_id",
+            "enable.auto.commit": "false",
+            "auto.offset.reset": "earliest",
+            "sasl.username": access_config.api_key,
+            "sasl.password": access_config.secret,
+            "sasl.mechanism": "PLAIN",
+            "security.protocol": "SASL_SSL",
+        }
+
+        return conf
+
+    def get_producer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+        access_config = self.access_config.get_secret_value()
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+            "sasl.username": access_config.api_key,
+            "sasl.password": access_config.secret,
+            "sasl.mechanism": "PLAIN",
+            "security.protocol": "SASL_SSL",
+        }
+
+        return conf
+
+
+class CloudKafkaIndexerConfig(KafkaIndexerConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaIndexer(KafkaIndexer):
+    connection_config: CloudKafkaConnectionConfig
+    index_config: CloudKafkaIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class CloudKafkaDownloaderConfig(KafkaDownloaderConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaDownloader(KafkaDownloader):
+    connection_config: CloudKafkaConnectionConfig
+    download_config: CloudKafkaDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class CloudKafkaUploaderConfig(KafkaUploaderConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaUploader(KafkaUploader):
+    connection_config: CloudKafkaConnectionConfig
+    upload_config: CloudKafkaUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+kafka_cloud_source_entry = SourceRegistryEntry(
+    connection_config=CloudKafkaConnectionConfig,
+    indexer=CloudKafkaIndexer,
+    indexer_config=CloudKafkaIndexerConfig,
+    downloader=CloudKafkaDownloader,
+    downloader_config=CloudKafkaDownloaderConfig,
+)
+
+kafka_cloud_destination_entry = DestinationRegistryEntry(
+    connection_config=CloudKafkaConnectionConfig,
+    uploader=CloudKafkaUploader,
+    uploader_config=CloudKafkaUploaderConfig,
+)
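
As a usage note, CloudKafkaConnectionConfig centralizes the SASL_SSL settings so a Confluent Cloud consumer or producer configuration can be derived from one object. A rough sketch, assuming the base KafkaConnectionConfig exposes the bootstrap_server and port fields referenced in the methods above; the server and credential values are placeholders:

# Hypothetical construction of the cloud connection config and its consumer settings.
from unstructured_ingest.v2.processes.connectors.kafka.cloud import (
    CloudKafkaAccessConfig,
    CloudKafkaConnectionConfig,
)

connection_config = CloudKafkaConnectionConfig(
    bootstrap_server="pkc-xxxxx.us-east-1.aws.confluent.cloud",  # placeholder
    port=9092,
    access_config=CloudKafkaAccessConfig(
        kafka_api_key="API_KEY",  # placeholder, supplied via the field alias
        secret="API_SECRET",      # placeholder
    ),
)

conf = connection_config.get_consumer_configuration()
# conf carries bootstrap.servers plus the SASL_SSL/PLAIN auth and consumer-group defaults
print(conf["bootstrap.servers"], conf["security.protocol"])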