unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)
  1. test/integration/connectors/sql/test_singlestore.py +156 -0
  2. test/integration/connectors/test_confluence.py +113 -0
  3. test/integration/connectors/test_kafka.py +67 -0
  4. test/integration/connectors/test_onedrive.py +112 -0
  5. test/integration/connectors/test_qdrant.py +137 -0
  6. test/integration/connectors/test_s3.py +1 -1
  7. test/integration/connectors/utils/docker.py +2 -1
  8. test/integration/connectors/utils/docker_compose.py +23 -8
  9. test/integration/connectors/utils/validation.py +73 -22
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/connector/kafka.py +0 -1
  12. unstructured_ingest/interfaces.py +7 -7
  13. unstructured_ingest/v2/interfaces/file_data.py +1 -0
  14. unstructured_ingest/v2/processes/chunker.py +2 -2
  15. unstructured_ingest/v2/processes/connectors/__init__.py +15 -7
  16. unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
  17. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -5
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +2 -10
  20. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  21. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +13 -0
  22. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +82 -0
  23. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +196 -0
  24. unstructured_ingest/v2/processes/connectors/kafka/local.py +75 -0
  25. unstructured_ingest/v2/processes/connectors/onedrive.py +163 -2
  26. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  27. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  28. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  29. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  30. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  31. unstructured_ingest/v2/processes/connectors/sql/__init__.py +5 -0
  32. unstructured_ingest/v2/processes/connectors/sql/postgres.py +1 -20
  33. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
  34. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  35. unstructured_ingest/v2/processes/connectors/sql/sql.py +15 -6
  36. unstructured_ingest/v2/processes/partitioner.py +14 -3
  37. unstructured_ingest/v2/unstructured_api.py +25 -11
  38. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/METADATA +17 -17
  39. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/RECORD +43 -27
  40. unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
  41. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/LICENSE.md +0 -0
  42. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/gitlab.py
@@ -0,0 +1,267 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Generator, Optional
+ from urllib.parse import urlparse
+
+ from pydantic import Field, Secret, model_validator
+
+ from unstructured_ingest.error import SourceConnectionError
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     DownloadResponse,
+     FileData,
+     FileDataSourceMetadata,
+     Indexer,
+     IndexerConfig,
+     SourceIdentifiers,
+ )
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+ CONNECTOR_TYPE = "gitlab"
+ if TYPE_CHECKING:
+     from gitlab import Gitlab
+     from gitlab.v4.objects.projects import Project
+
+
+ class GitLabAccessConfig(AccessConfig):
+     access_token: Optional[str] = Field(
+         default=None,
+         description="Optional personal access token for authenticating with the GitLab API.",
+     )
+
+
+ class GitLabConnectionConfig(ConnectionConfig):
+     access_config: Secret[GitLabAccessConfig] = Field(
+         default_factory=GitLabAccessConfig,
+         validate_default=True,
+         description="Secret configuration for accessing the GitLab API by authentication token.",
+     )
+     url: str = Field(description="The full URL to the GitLab project or repository.")
+     base_url: str = Field(
+         default="https://gitlab.com",
+         description="The base URL for the GitLab instance (default is GitLab's public domain).",
+     )
+     repo_path: str = Field(
+         default=None,
+         init=False,
+         repr=False,
+         description="The normalized path extracted from the repository URL.",
+     )
+
+     @model_validator(mode="after")
+     def set_repo_path(self):
+         """
+         Parses the provided GitLab URL to extract the `base_url` and `repo_path`,
+         ensuring both are properly formatted for use.
+
+         If the URL contains a scheme (e.g., 'https') and a network location (e.g., 'gitlab.com'),
+         the `base_url` is set accordingly. The repository path is extracted and normalized
+         by removing any leading slashes.
+
+         Notes:
+             - If the URL contains both a scheme and network location, the `base_url` is
+               extracted directly from the URL.
+             - The `repo_path` is adjusted to remove any leading slashes.
+             - This method assumes that the URL follows GitLab's structure
+               (e.g., 'https://gitlab.com/owner/repo').
+         """
+         parsed_url = urlparse(self.url)
+
+         if parsed_url.scheme and parsed_url.netloc:
+             self.base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+         self.repo_path = parsed_url.path.lstrip("/")
+
+         return self
+
+     @SourceConnectionError.wrap
+     @requires_dependencies(["gitlab"], extras="gitlab")
+     def get_client(self) -> "Gitlab":
+         from gitlab import Gitlab
+
+         logger.info(f"Connecting to GitLab: {self.base_url!r}")
+         gitlab = Gitlab(
+             self.base_url, private_token=self.access_config.get_secret_value().access_token
+         )
+         return gitlab
+
+     def get_project(self) -> "Project":
+         """Retrieves the specified GitLab project using the configured base URL and access token.
+
+         Returns:
+             Project: A GitLab `Project` object representing the specified repository.
+
+         Raises:
+             SourceConnectionError: If the GitLab API connection fails.
+             gitlab.exceptions.GitlabGetError: If the project is not found.
+         """
+         gitlab = self.get_client()
+
+         logger.info(f"Accessing Project: '{self.repo_path}'")
+         project = gitlab.projects.get(self.repo_path)
+
+         logger.info(f"Successfully accessed project '{self.repo_path}'")
+         return project
+
+
+ class GitLabIndexerConfig(IndexerConfig):
+     path: Path = Field(
+         default="/", description="Path to the location in the repository that will be processed."
+     )
+     recursive: bool = Field(
+         default=True,
+         description=(
+             "Flag to control recursive operations when indexing. "
+             "If True, the indexer will traverse directories recursively."
+         ),
+     )
+     git_branch: Optional[str] = Field(
+         default=None,
+         description="The name of the branch to interact with.",
+     )
+
+
+ @dataclass
+ class GitLabIndexer(Indexer):
+     connection_config: GitLabConnectionConfig
+     index_config: GitLabIndexerConfig
+
+     def precheck(self) -> None:
+         """Validates the connection to the GitLab instance by authenticating or
+         accessing the project.
+
+         This method ensures that the GitLab credentials and configuration are correct by
+         either authenticating or attempting to fetch the specified project.
+
+         Raises:
+             SourceConnectionError: If the connection or authentication with GitLab fails.
+         """
+
+         try:
+             gitlab = self.connection_config.get_client()
+             if self.connection_config.access_config.get_secret_value().access_token is not None:
+                 gitlab.auth()
+             else:
+                 gitlab.projects.get(self.connection_config.repo_path)
+
+         except Exception as e:
+             logger.error(f"Failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"Failed to validate connection: {e}")
+
+     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+         """Iterates over the GitLab repository tree and yields file metadata as `FileData` objects.
+
+         This method fetches the repository tree for the specified branch and iterates
+         over its contents. For each file (blob), it generates a `FileData` object containing
+         the file's metadata, path, and permissions.
+
+         Args:
+             **kwargs (Any): Additional keyword arguments (if required).
+
+         Yields:
+             FileData: A generator that yields `FileData` objects representing each file (blob)
+                 in the repository.
+         """
+         project = self.connection_config.get_project()
+
+         ref = self.index_config.git_branch or project.default_branch
+
+         files = project.repository_tree(
+             path=str(self.index_config.path),
+             ref=ref,
+             recursive=self.index_config.recursive,
+             iterator=True,
+             all=True,
+         )
+
+         for file in files:
+             relative_path = str(Path(file["path"]).relative_to(self.index_config.path))
+             if file["type"] == "blob":
+                 record_locator = {
+                     "file_path": file["path"],
+                     "ref": ref,
+                 }
+
+                 yield FileData(
+                     identifier=file["id"],
+                     connector_type=CONNECTOR_TYPE,
+                     source_identifiers=SourceIdentifiers(
+                         fullpath=file["path"],
+                         filename=Path(file["path"]).name,
+                         rel_path=relative_path,
+                     ),
+                     metadata=FileDataSourceMetadata(
+                         url=file["id"],
+                         record_locator=record_locator,
+                         permissions_data=[{"mode": file["mode"]}],
+                     ),
+                     additional_metadata={},
+                 )
+
+
+ class GitLabDownloaderConfig(DownloaderConfig):
+     pass
+
+
+ @dataclass
+ class GitLabDownloader(Downloader):
+     connection_config: GitLabConnectionConfig
+     download_config: GitLabDownloaderConfig
+
+     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+         """Downloads a file from the repository and returns a `DownloadResponse`.
+
+         Args:
+             file_data (FileData): Metadata about the file to be downloaded.
+             **kwargs (Any): Additional arguments (if required).
+
+         Returns:
+             DownloadResponse: A response object containing the download details.
+         """
+         download_path = self.get_download_path(file_data=file_data)
+         if download_path is None:
+             logger.error(
+                 "Generated download path is None, source_identifiers might be missing "
+                 "from FileData."
+             )
+             raise ValueError("Generated invalid download path.")
+
+         self._download_file(file_data, download_path)
+         return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+     def _download_file(self, file_data: FileData, download_path: Path) -> None:
+         # NOTE: Indexer should supply the record locator in metadata
+         if (
+             file_data.metadata.record_locator is None
+             or "ref" not in file_data.metadata.record_locator
+             or "file_path" not in file_data.metadata.record_locator
+         ):
+             logger.error(
+                 f"Invalid record locator in metadata: {file_data.metadata.record_locator}. "
+                 "Keys 'ref' and 'file_path' must be present."
+             )
+             raise ValueError("Invalid record locator.")
+
+         ref = file_data.metadata.record_locator["ref"]
+         path = file_data.metadata.record_locator["file_path"]
+
+         project_file = self.connection_config.get_project().files.get(file_path=path, ref=ref)
+         download_path.parent.mkdir(exist_ok=True, parents=True)
+
+         with open(download_path, "wb") as file:
+             file.write(project_file.decode())
+
+
+ gitlab_source_entry = SourceRegistryEntry(
+     connection_config=GitLabConnectionConfig,
+     indexer_config=GitLabIndexerConfig,
+     indexer=GitLabIndexer,
+     downloader_config=GitLabDownloaderConfig,
+     downloader=GitLabDownloader,
+ )
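
For reference, a minimal sketch of how this new GitLab source might be exercised end to end, using the classes added above. The project URL, token, branch, and download directory are hypothetical, and it assumes the base DownloaderConfig accepts a download_dir:

from pathlib import Path

from unstructured_ingest.v2.processes.connectors.gitlab import (
    GitLabAccessConfig,
    GitLabConnectionConfig,
    GitLabDownloader,
    GitLabDownloaderConfig,
    GitLabIndexer,
    GitLabIndexerConfig,
)

# Hypothetical repository and token, for illustration only.
connection_config = GitLabConnectionConfig(
    url="https://gitlab.com/owner/repo",
    access_config=GitLabAccessConfig(access_token="glpat-example"),
)
indexer = GitLabIndexer(
    connection_config=connection_config,
    index_config=GitLabIndexerConfig(git_branch="main", recursive=True),
)
downloader = GitLabDownloader(
    connection_config=connection_config,
    download_config=GitLabDownloaderConfig(download_dir=Path("/tmp/gitlab-files")),
)

indexer.precheck()  # fails fast with SourceConnectionError on bad credentials
for file_data in indexer.run():  # one FileData per blob in the repository tree
    downloader.run(file_data=file_data)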
unstructured_ingest/v2/processes/connectors/kafka/__init__.py
@@ -0,0 +1,13 @@
+ from __future__ import annotations
+
+ from unstructured_ingest.v2.processes.connector_registry import (
+     add_source_entry,
+ )
+
+ from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
+ from .cloud import kafka_cloud_source_entry
+ from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
+ from .local import kafka_local_source_entry
+
+ add_source_entry(source_type=LOCAL_CONNECTOR, entry=kafka_local_source_entry)
+ add_source_entry(source_type=CLOUD_CONNECTOR, entry=kafka_cloud_source_entry)
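
Because registration happens at import time, simply importing this package makes both Kafka flavors discoverable. A quick sanity check, assuming the registry module also exposes the source_registry mapping that add_source_entry populates:

# Import needed only for its registration side effect (hypothetical usage).
import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401

from unstructured_ingest.v2.processes.connector_registry import source_registry

assert "kafka-local" in source_registry
assert "kafka-cloud" in source_registry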
unstructured_ingest/v2/processes/connectors/kafka/cloud.py
@@ -0,0 +1,82 @@
+ import socket
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Optional
+
+ from pydantic import Field, Secret, SecretStr
+
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+ from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
+     KafkaAccessConfig,
+     KafkaConnectionConfig,
+     KafkaDownloader,
+     KafkaDownloaderConfig,
+     KafkaIndexer,
+     KafkaIndexerConfig,
+ )
+
+ if TYPE_CHECKING:
+     pass
+
+ CONNECTOR_TYPE = "kafka-cloud"
+
+
+ class CloudKafkaAccessConfig(KafkaAccessConfig):
+     api_key: Optional[SecretStr] = Field(
+         description="Kafka API key used to connect to the server", alias="kafka_api_key", default=None
+     )
+     secret: Optional[SecretStr] = Field(description="Kafka API secret used to connect to the server", default=None)
+
+
+ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
+     access_config: Secret[CloudKafkaAccessConfig]
+
+     def get_consumer_configuration(self) -> dict:
+         bootstrap = self.bootstrap_server
+         port = self.port
+         access_config = self.access_config.get_secret_value()
+
+         conf = {
+             "bootstrap.servers": f"{bootstrap}:{port}",
+             "client.id": socket.gethostname(),
+             "group.id": "default_group_id",
+             "enable.auto.commit": "false",
+             "auto.offset.reset": "earliest",
+             "message.max.bytes": 10485760,
+             "sasl.username": access_config.api_key.get_secret_value() if access_config.api_key else None,
+             "sasl.password": access_config.secret.get_secret_value() if access_config.secret else None,
+             "sasl.mechanism": "PLAIN",
+             "security.protocol": "SASL_SSL",
+         }
+
+         return conf
+
+
+ class CloudKafkaIndexerConfig(KafkaIndexerConfig):
+     pass
+
+
+ @dataclass
+ class CloudKafkaIndexer(KafkaIndexer):
+     connection_config: CloudKafkaConnectionConfig
+     index_config: CloudKafkaIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class CloudKafkaDownloaderConfig(KafkaDownloaderConfig):
+     pass
+
+
+ @dataclass
+ class CloudKafkaDownloader(KafkaDownloader):
+     connection_config: CloudKafkaConnectionConfig
+     download_config: CloudKafkaDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ kafka_cloud_source_entry = SourceRegistryEntry(
+     connection_config=CloudKafkaConnectionConfig,
+     indexer=CloudKafkaIndexer,
+     indexer_config=CloudKafkaIndexerConfig,
+     downloader=CloudKafkaDownloader,
+     downloader_config=CloudKafkaDownloaderConfig,
+ )
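
A short sketch of the consumer configuration this connector produces; the bootstrap server and credentials below are placeholders. Note that the field alias means the key is supplied as kafka_api_key:

from unstructured_ingest.v2.processes.connectors.kafka.cloud import (
    CloudKafkaAccessConfig,
    CloudKafkaConnectionConfig,
)

# Hypothetical Confluent Cloud endpoint and credentials, for illustration only.
config = CloudKafkaConnectionConfig(
    bootstrap_server="pkc-example.us-east-1.aws.confluent.cloud",
    port=9092,
    access_config=CloudKafkaAccessConfig(kafka_api_key="example-key", secret="example-secret"),
)

conf = config.get_consumer_configuration()
assert conf["bootstrap.servers"] == "pkc-example.us-east-1.aws.confluent.cloud:9092"
assert conf["security.protocol"] == "SASL_SSL"
assert conf["sasl.mechanism"] == "PLAIN"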
unstructured_ingest/v2/processes/connectors/kafka/kafka.py
@@ -0,0 +1,196 @@
+ from abc import ABC, abstractmethod
+ from contextlib import contextmanager
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from time import time
+ from typing import TYPE_CHECKING, Any, ContextManager, Generator, Optional
+
+ from pydantic import Secret
+
+ from unstructured_ingest.error import (
+     SourceConnectionError,
+     SourceConnectionNetworkError,
+ )
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     FileData,
+     FileDataSourceMetadata,
+     Indexer,
+     IndexerConfig,
+     SourceIdentifiers,
+     download_responses,
+ )
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+ if TYPE_CHECKING:
+     from confluent_kafka import Consumer
+
+ CONNECTOR_TYPE = "kafka"
+
+
+ class KafkaAccessConfig(AccessConfig, ABC):
+     pass
+
+
+ class KafkaConnectionConfig(ConnectionConfig, ABC):
+     access_config: Secret[KafkaAccessConfig]
+     timeout: Optional[float] = 1.0
+     bootstrap_server: str
+     port: int
+
+     @abstractmethod
+     def get_consumer_configuration(self) -> dict:
+         pass
+
+     @contextmanager
+     @requires_dependencies(["confluent_kafka"], extras="kafka")
+     def get_consumer(self) -> ContextManager["Consumer"]:
+         from confluent_kafka import Consumer
+
+         consumer = Consumer(self.get_consumer_configuration())
+         try:
+             logger.debug("kafka consumer connected")
+             yield consumer
+         finally:
+             consumer.close()
+
+
+ class KafkaIndexerConfig(IndexerConfig):
+     topic: str
+     num_messages_to_consume: Optional[int] = 100
+
+     def update_consumer(self, consumer: "Consumer") -> None:
+         consumer.subscribe([self.topic])
+
+
+ @dataclass
+ class KafkaIndexer(Indexer):
+     connection_config: KafkaConnectionConfig
+     index_config: KafkaIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     @contextmanager
+     def get_consumer(self) -> ContextManager["Consumer"]:
+         with self.connection_config.get_consumer() as consumer:
+             self.index_config.update_consumer(consumer=consumer)
+             yield consumer
+
+     @requires_dependencies(["confluent_kafka"], extras="kafka")
+     def generate_messages(self) -> Generator[Any, None, None]:
+         from confluent_kafka import KafkaError, KafkaException
+
+         messages_consumed = 0
+         max_empty_polls = 10
+         empty_polls = 0
+         num_messages_to_consume = self.index_config.num_messages_to_consume
+         with self.get_consumer() as consumer:
+             while messages_consumed < num_messages_to_consume and empty_polls < max_empty_polls:
+                 msg = consumer.poll(timeout=self.connection_config.timeout)
+                 if msg is None:
+                     logger.debug("No Kafka messages found")
+                     empty_polls += 1
+                     continue
+                 if msg.error():
+                     if msg.error().code() == KafkaError._PARTITION_EOF:
+                         logger.info(
+                             "Reached end of partition for topic %s [%d] at offset %d"
+                             % (msg.topic(), msg.partition(), msg.offset())
+                         )
+                         break
+                     else:
+                         raise KafkaException(msg.error())
+                 try:
+                     empty_polls = 0
+                     messages_consumed += 1
+                     yield msg
+                 finally:
+                     consumer.commit(asynchronous=False)
+
+     def generate_file_data(self, msg) -> FileData:
+         msg_content = msg.value().decode("utf8")
+         identifier = f"{msg.topic()}_{msg.partition()}_{msg.offset()}"
+         additional_metadata = {
+             "topic": msg.topic(),
+             "partition": msg.partition(),
+             "offset": msg.offset(),
+             "content": msg_content,
+         }
+         filename = f"{identifier}.txt"
+         return FileData(
+             identifier=identifier,
+             connector_type=self.connector_type,
+             source_identifiers=SourceIdentifiers(
+                 filename=filename,
+                 fullpath=filename,
+             ),
+             metadata=FileDataSourceMetadata(
+                 date_processed=str(time()),
+             ),
+             additional_metadata=additional_metadata,
+             display_name=filename,
+         )
+
+     def run(self) -> Generator[FileData, None, None]:
+         for message in self.generate_messages():
+             yield self.generate_file_data(message)
+
+     async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         raise NotImplementedError()
+
+     def precheck(self):
+         try:
+             with self.get_consumer() as consumer:
+                 cluster_meta = consumer.list_topics(timeout=self.connection_config.timeout)
+                 current_topics = [
+                     topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+                 ]
+                 logger.info(f"successfully checked available topics: {current_topics}")
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
+
+ class KafkaDownloaderConfig(DownloaderConfig):
+     pass
+
+
+ @dataclass
+ class KafkaDownloader(Downloader):
+     connection_config: KafkaConnectionConfig
+     download_config: KafkaDownloaderConfig = field(default_factory=KafkaDownloaderConfig)
+     connector_type: str = CONNECTOR_TYPE
+     version: Optional[str] = None
+     source_url: Optional[str] = None
+
+     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         source_identifiers = file_data.source_identifiers
+         if source_identifiers is None:
+             raise ValueError("FileData is missing source_identifiers")
+
+         # Build the download path using source_identifiers
+         download_path = Path(self.download_dir) / source_identifiers.relative_path
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+
+         try:
+             content = file_data.additional_metadata["content"]
+             with open(download_path, "w") as file:
+                 file.write(content)
+         except Exception as e:
+             logger.error(f"Failed to download file {file_data.identifier}: {e}")
+             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+         return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+ kafka_source_entry = SourceRegistryEntry(
+     connection_config=KafkaConnectionConfig,
+     indexer=KafkaIndexer,
+     indexer_config=KafkaIndexerConfig,
+     downloader=KafkaDownloader,
+     downloader_config=KafkaDownloaderConfig,
+ )
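
The base class leaves get_consumer_configuration abstract, so each deployment flavor supplies only its own consumer settings while inheriting the polling indexer and downloader unchanged. As a sketch of that extension point, a hypothetical SASL/SCRAM variant (not part of this release) might look like:

import socket

from pydantic import Field, Secret

from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
    KafkaAccessConfig,
    KafkaConnectionConfig,
)


class ScramKafkaAccessConfig(KafkaAccessConfig):
    # Hypothetical credentials for a SASL/SCRAM-secured broker.
    username: str = Field(description="SASL username")
    password: str = Field(description="SASL password")


class ScramKafkaConnectionConfig(KafkaConnectionConfig):
    access_config: Secret[ScramKafkaAccessConfig]

    def get_consumer_configuration(self) -> dict:
        # Same base settings as the shipped connectors, with SCRAM auth swapped in.
        access_config = self.access_config.get_secret_value()
        return {
            "bootstrap.servers": f"{self.bootstrap_server}:{self.port}",
            "client.id": socket.gethostname(),
            "group.id": "default_group_id",
            "enable.auto.commit": "false",
            "auto.offset.reset": "earliest",
            "sasl.mechanism": "SCRAM-SHA-512",
            "security.protocol": "SASL_SSL",
            "sasl.username": access_config.username,
            "sasl.password": access_config.password,
        }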
unstructured_ingest/v2/processes/connectors/kafka/local.py
@@ -0,0 +1,75 @@
+ import socket
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+ from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
+     KafkaAccessConfig,
+     KafkaConnectionConfig,
+     KafkaDownloader,
+     KafkaDownloaderConfig,
+     KafkaIndexer,
+     KafkaIndexerConfig,
+ )
+
+ if TYPE_CHECKING:
+     pass
+
+ CONNECTOR_TYPE = "kafka-local"
+
+
+ class LocalKafkaAccessConfig(KafkaAccessConfig):
+     pass
+
+
+ class LocalKafkaConnectionConfig(KafkaConnectionConfig):
+     access_config: Secret[LocalKafkaAccessConfig] = Field(
+         default=LocalKafkaAccessConfig(), validate_default=True
+     )
+
+     def get_consumer_configuration(self) -> dict:
+         bootstrap = self.bootstrap_server
+         port = self.port
+
+         conf = {
+             "bootstrap.servers": f"{bootstrap}:{port}",
+             "client.id": socket.gethostname(),
+             "group.id": "default_group_id",
+             "enable.auto.commit": "false",
+             "auto.offset.reset": "earliest",
+             "message.max.bytes": 10485760,
+         }
+         return conf
+
+
+ class LocalKafkaIndexerConfig(KafkaIndexerConfig):
+     pass
+
+
+ @dataclass
+ class LocalKafkaIndexer(KafkaIndexer):
+     connection_config: LocalKafkaConnectionConfig
+     index_config: LocalKafkaIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class LocalKafkaDownloaderConfig(KafkaDownloaderConfig):
+     pass
+
+
+ @dataclass
+ class LocalKafkaDownloader(KafkaDownloader):
+     connection_config: LocalKafkaConnectionConfig
+     download_config: LocalKafkaDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ kafka_local_source_entry = SourceRegistryEntry(
+     connection_config=LocalKafkaConnectionConfig,
+     indexer=LocalKafkaIndexer,
+     indexer_config=LocalKafkaIndexerConfig,
+     downloader=LocalKafkaDownloader,
+     downloader_config=LocalKafkaDownloaderConfig,
+ )
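
Finally, a minimal sketch of consuming from a local broker with these classes. The broker address, topic, and download directory are hypothetical, and it assumes the base DownloaderConfig accepts a download_dir:

from pathlib import Path

from unstructured_ingest.v2.processes.connectors.kafka.local import (
    LocalKafkaConnectionConfig,
    LocalKafkaDownloader,
    LocalKafkaDownloaderConfig,
    LocalKafkaIndexer,
    LocalKafkaIndexerConfig,
)

# Hypothetical local broker and topic, for illustration only.
connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
indexer = LocalKafkaIndexer(
    connection_config=connection_config,
    index_config=LocalKafkaIndexerConfig(topic="ingest-topic", num_messages_to_consume=10),
)
downloader = LocalKafkaDownloader(
    connection_config=connection_config,
    download_config=LocalKafkaDownloaderConfig(download_dir=Path("/tmp/kafka-files")),
)

indexer.precheck()  # lists topics to verify the broker is reachable
for file_data in indexer.run():  # one FileData per consumed message
    downloader.run(file_data=file_data)  # writes message content to a .txt file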