unstructured-ingest 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

Files changed (39)
  1. test/integration/connectors/duckdb/__init__.py +0 -0
  2. test/integration/connectors/duckdb/test_duckdb.py +82 -0
  3. test/integration/connectors/duckdb/test_motherduck.py +106 -0
  4. test/integration/connectors/test_kafka.py +109 -6
  5. test/integration/connectors/test_qdrant.py +55 -0
  6. test/unit/v2/connectors/test_confluence.py +39 -0
  7. unstructured_ingest/__version__.py +1 -1
  8. unstructured_ingest/v2/processes/connectors/__init__.py +1 -0
  9. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +24 -21
  10. unstructured_ingest/v2/processes/connectors/chroma.py +6 -5
  11. unstructured_ingest/v2/processes/connectors/confluence.py +14 -2
  12. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  13. unstructured_ingest/v2/processes/connectors/duckdb/base.py +99 -0
  14. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +118 -0
  15. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +133 -0
  16. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +34 -15
  17. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -2
  18. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -11
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +3 -3
  20. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +2 -2
  21. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +2 -3
  22. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +8 -8
  23. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +9 -2
  24. unstructured_ingest/v2/processes/connectors/kafka/local.py +1 -1
  25. unstructured_ingest/v2/processes/connectors/kdbai.py +2 -2
  26. unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
  27. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +6 -4
  28. unstructured_ingest/v2/processes/connectors/sql/__init__.py +2 -1
  29. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +7 -9
  30. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +62 -24
  31. unstructured_ingest/v2/processes/connectors/sql/sql.py +8 -3
  32. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +6 -9
  33. unstructured_ingest/v2/utils.py +9 -0
  34. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/METADATA +19 -17
  35. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/RECORD +39 -31
  36. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/LICENSE.md +0 -0
  37. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/WHEEL +0 -0
  38. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/entry_points.txt +0 -0
  39. {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -0,0 +1,99 @@
+ import json
+ import uuid
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+
+ from unstructured_ingest.v2.interfaces import FileData, UploadStager
+
+ _COLUMNS = (
+     "id",
+     "element_id",
+     "text",
+     "embeddings",
+     "type",
+     "system",
+     "layout_width",
+     "layout_height",
+     "points",
+     "url",
+     "version",
+     "date_created",
+     "date_modified",
+     "date_processed",
+     "permissions_data",
+     "record_locator",
+     "category_depth",
+     "parent_id",
+     "attached_filename",
+     "filetype",
+     "last_modified",
+     "file_directory",
+     "filename",
+     "languages",
+     "page_number",
+     "links",
+     "page_name",
+     "link_urls",
+     "link_texts",
+     "sent_from",
+     "sent_to",
+     "subject",
+     "section",
+     "header_footer_type",
+     "emphasized_text_contents",
+     "emphasized_text_tags",
+     "text_as_html",
+     "regex_metadata",
+     "detection_class_prob",
+ )
+
+ # _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
+
+
+ @dataclass
+ class BaseDuckDBUploadStager(UploadStager):
+
+     def run(
+         self,
+         elements_filepath: Path,
+         file_data: FileData,
+         output_dir: Path,
+         output_filename: str,
+         **kwargs: Any,
+     ) -> Path:
+         with open(elements_filepath) as elements_file:
+             elements_contents: list[dict] = json.load(elements_file)
+         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         output = []
+         for data in elements_contents:
+             metadata: dict[str, Any] = data.pop("metadata", {})
+             data_source = metadata.pop("data_source", {})
+             coordinates = metadata.pop("coordinates", {})
+
+             data.update(metadata)
+             data.update(data_source)
+             data.update(coordinates)
+
+             data["id"] = str(uuid.uuid4())
+
+             # remove extraneous, not supported columns
+             data = {k: v for k, v in data.items() if k in _COLUMNS}
+
+             output.append(data)
+
+         df = pd.DataFrame.from_dict(output)
+
+         for column in filter(
+             lambda x: x in df.columns,
+             ("version", "page_number", "regex_metadata"),
+         ):
+             df[column] = df[column].apply(str)
+
+         with output_path.open("w") as output_file:
+             df.to_json(output_file, orient="records", lines=True)
+         return output_path
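
The stager above flattens each element's nested metadata into a fixed column set and writes a JSON Lines file. A minimal sketch of reading a staged file back with pandas, assuming a hypothetical output path; this mirrors how the uploaders below load the staged data:

import pandas as pd

# Hypothetical path produced by BaseDuckDBUploadStager.run(); one JSON object per line.
staged_path = "work-dir/example-output.json"
df = pd.read_json(staged_path, orient="records", lines=True)
print(df.columns.tolist())  # a subset of _COLUMNS kept by the stager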
unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py
@@ -0,0 +1,118 @@
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Callable, Optional
+
+ import pandas as pd
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.error import DestinationConnectionError
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     FileData,
+     Uploader,
+     UploaderConfig,
+     UploadStagerConfig,
+ )
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
+
+ if TYPE_CHECKING:
+     from duckdb import DuckDBPyConnection as DuckDBConnection
+
+ CONNECTOR_TYPE = "duckdb"
+
+
+ class DuckDBAccessConfig(AccessConfig):
+     pass
+
+
+ class DuckDBConnectionConfig(ConnectionConfig):
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+     database: Optional[str] = Field(
+         default=None,
+         description="Database name. Path to the DuckDB .db file. If the file does "
+         "not exist, it will be created at the specified path.",
+     )
+     db_schema: Optional[str] = Field(
+         default="main",
+         description="Schema name. Schema in the database where the elements table is located.",
+     )
+     table: Optional[str] = Field(
+         default="elements",
+         description="Table name. Table name into which the elements data is inserted.",
+     )
+     access_config: Secret[DuckDBAccessConfig] = Field(
+         default=DuckDBAccessConfig(), validate_default=True
+     )
+
+     def __post_init__(self):
+         if self.database is None:
+             raise ValueError(
+                 "A DuckDB connection requires a path to a *.db or *.duckdb file "
+                 "through the `database` argument"
+             )
+
+
+ class DuckDBUploadStagerConfig(UploadStagerConfig):
+     pass
+
+
+ @dataclass
+ class DuckDBUploadStager(BaseDuckDBUploadStager):
+     upload_stager_config: DuckDBUploadStagerConfig = field(
+         default_factory=lambda: DuckDBUploadStagerConfig()
+     )
+
+
+ class DuckDBUploaderConfig(UploaderConfig):
+     batch_size: int = Field(default=50, description="[Not-used] Number of records per batch")
+
+
+ @dataclass
+ class DuckDBUploader(Uploader):
+     connector_type: str = CONNECTOR_TYPE
+     upload_config: DuckDBUploaderConfig
+     connection_config: DuckDBConnectionConfig
+
+     def precheck(self) -> None:
+         try:
+             cursor = self.connection().cursor()
+             cursor.execute("SELECT 1;")
+             cursor.close()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+     @property
+     def connection(self) -> Callable[[], "DuckDBConnection"]:
+         return self._make_duckdb_connection
+
+     @requires_dependencies(["duckdb"], extras="duckdb")
+     def _make_duckdb_connection(self) -> "DuckDBConnection":
+         import duckdb
+
+         return duckdb.connect(self.connection_config.database)
+
+     def upload_contents(self, path: Path) -> None:
+         df_elements = pd.read_json(path, orient="records", lines=True)
+         logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
+
+         with self.connection() as conn:
+             conn.query(
+                 f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+             )
+
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         self.upload_contents(path=path)
+
+
+ duckdb_destination_entry = DestinationRegistryEntry(
+     connection_config=DuckDBConnectionConfig,
+     uploader=DuckDBUploader,
+     uploader_config=DuckDBUploaderConfig,
+     upload_stager=DuckDBUploadStager,
+     upload_stager_config=DuckDBUploadStagerConfig,
+ )
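
The uploader leans on two DuckDB features: the Python client's replacement scan, which lets SQL reference an in-scope pandas DataFrame by its variable name, and INSERT INTO ... BY NAME, which matches DataFrame columns to table columns regardless of order. A standalone sketch of that pattern, with a hypothetical table and database file (assumes a DuckDB release that supports BY NAME inserts):

import duckdb
import pandas as pd

# Hypothetical table and data; the real connector expects the full elements schema.
df_elements = pd.DataFrame([{"id": "1", "text": "hello", "type": "NarrativeText"}])

conn = duckdb.connect("elements.db")
conn.execute("CREATE TABLE IF NOT EXISTS main.elements (id VARCHAR, text VARCHAR, type VARCHAR)")
# DuckDB resolves `df_elements` from the surrounding Python scope (replacement scan).
conn.query("INSERT INTO main.elements BY NAME SELECT * FROM df_elements")
print(conn.query("SELECT count(*) FROM main.elements").fetchall())
conn.close()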
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py
@@ -0,0 +1,133 @@
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Callable, Optional
+
+ import pandas as pd
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
+ from unstructured_ingest.error import DestinationConnectionError
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     FileData,
+     Uploader,
+     UploaderConfig,
+     UploadStagerConfig,
+ )
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
+
+ if TYPE_CHECKING:
+     from duckdb import DuckDBPyConnection as MotherDuckConnection
+
+ CONNECTOR_TYPE = "motherduck"
+
+
+ class MotherDuckAccessConfig(AccessConfig):
+     md_token: Optional[str] = Field(default=None, description="MotherDuck token")
+
+
+ class MotherDuckConnectionConfig(ConnectionConfig):
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+     database: Optional[str] = Field(
+         default=None,
+         description="Database name. Name of the MotherDuck database.",
+     )
+     db_schema: Optional[str] = Field(
+         default="main",
+         description="Schema name. Schema in the database where the elements table is located.",
+     )
+     table: Optional[str] = Field(
+         default="elements",
+         description="Table name. Table name into which the elements data is inserted.",
+     )
+     access_config: Secret[MotherDuckAccessConfig] = Field(
+         default=MotherDuckAccessConfig(), validate_default=True
+     )
+
+     def __post_init__(self):
+         if self.database is None:
+             raise ValueError(
+                 "A MotherDuck connection requires a database (string) to be passed "
+                 "through the `database` argument"
+             )
+         if self.access_config.md_token is None:
+             raise ValueError(
+                 "A MotherDuck connection requires a md_token (MotherDuck token) to be passed "
+                 "using MotherDuckAccessConfig through the `access_config` argument"
+             )
+
+
+ class MotherDuckUploadStagerConfig(UploadStagerConfig):
+     pass
+
+
+ @dataclass
+ class MotherDuckUploadStager(BaseDuckDBUploadStager):
+     upload_stager_config: MotherDuckUploadStagerConfig = field(
+         default_factory=lambda: MotherDuckUploadStagerConfig()
+     )
+
+
+ class MotherDuckUploaderConfig(UploaderConfig):
+     batch_size: int = Field(default=50, description="[Not-used] Number of records per batch")
+
+
+ @dataclass
+ class MotherDuckUploader(Uploader):
+     connector_type: str = CONNECTOR_TYPE
+     upload_config: MotherDuckUploaderConfig
+     connection_config: MotherDuckConnectionConfig
+
+     def precheck(self) -> None:
+         try:
+             cursor = self.connection().cursor()
+             cursor.execute("SELECT 1;")
+             cursor.close()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+     @property
+     def connection(self) -> Callable[[], "MotherDuckConnection"]:
+         return self._make_motherduck_connection
+
+     @requires_dependencies(["duckdb"], extras="duckdb")
+     def _make_motherduck_connection(self) -> "MotherDuckConnection":
+         import duckdb
+
+         access_config = self.connection_config.access_config.get_secret_value()
+         conn = duckdb.connect(
+             f"md:?motherduck_token={access_config.md_token}",
+             config={
+                 "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
+             },
+         )
+
+         conn.sql(f"USE {self.connection_config.database}")
+
+         return conn
+
+     def upload_contents(self, path: Path) -> None:
+         df_elements = pd.read_json(path, orient="records", lines=True)
+         logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
+
+         with self.connection() as conn:
+             conn.query(
+                 f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+             )
+
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         self.upload_contents(path=path)
+
+
+ motherduck_destination_entry = DestinationRegistryEntry(
+     connection_config=MotherDuckConnectionConfig,
+     uploader=MotherDuckUploader,
+     uploader_config=MotherDuckUploaderConfig,
+     upload_stager=MotherDuckUploadStager,
+     upload_stager_config=MotherDuckUploadStagerConfig,
+ )
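
A rough usage sketch of wiring this destination up directly in code; the database name and token are placeholders, and it assumes pydantic wraps the plain access config into the Secret field (in practice the connector is driven through the ingest pipeline or CLI rather than constructed by hand):

from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
    MotherDuckAccessConfig,
    MotherDuckConnectionConfig,
    MotherDuckUploader,
    MotherDuckUploaderConfig,
)

connection_config = MotherDuckConnectionConfig(
    database="my_db",  # placeholder MotherDuck database name
    access_config=MotherDuckAccessConfig(md_token="<motherduck-token>"),  # placeholder token
)
uploader = MotherDuckUploader(
    upload_config=MotherDuckUploaderConfig(),
    connection_config=connection_config,
)
uploader.precheck()  # opens md:?motherduck_token=... and runs SELECT 1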
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py
@@ -1,7 +1,7 @@
+ import collections
  import hashlib
  import json
  import sys
- import uuid
  from contextlib import contextmanager
  from dataclasses import dataclass, field
  from pathlib import Path
@@ -41,6 +41,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
      SourceRegistryEntry,
  )
+ from unstructured_ingest.v2.utils import get_enhanced_element_id
  
  if TYPE_CHECKING:
      from elasticsearch import Elasticsearch as ElasticsearchClient
@@ -326,7 +327,7 @@ class ElasticsearchUploadStager(UploadStager):
      def conform_dict(self, data: dict, file_data: FileData) -> dict:
          resp = {
              "_index": self.upload_stager_config.index_name,
-             "_id": str(uuid.uuid4()),
+             "_id": get_enhanced_element_id(element_dict=data, file_data=file_data),
              "_source": {
                  "element_id": data.pop("element_id", None),
                  "embeddings": data.pop("embeddings", None),
@@ -425,7 +426,10 @@ class ElasticsearchUploader(Uploader):
          if failures := delete_resp.get("failures"):
              raise WriteError(f"failed to delete records: {failures}")
  
-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:  # type: ignore
+         from elasticsearch.helpers.errors import BulkIndexError
+
          parallel_bulk = self.load_parallel_bulk()
          with path.open("r") as file:
              elements_dict = json.load(file)
@@ -449,18 +453,33 @@
          for batch in generator_batching_wbytes(
              elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
          ):
-             for success, info in parallel_bulk(
-                 client=client,
-                 actions=batch,
-                 thread_count=self.upload_config.num_threads,
-             ):
-                 if not success:
-                     logger.error(
-                         "upload failed for a batch in "
-                         f"{(self.__class__.__name__).replace('Uploader', '')} "
-                         "destination connector:",
-                         info,
-                     )
+             try:
+                 iterator = parallel_bulk(
+                     client=client,
+                     actions=batch,
+                     thread_count=self.upload_config.num_threads,
+                 )
+                 collections.deque(iterator, maxlen=0)
+             except BulkIndexError as e:
+                 sanitized_errors = [
+                     self._sanitize_bulk_index_error(error) for error in e.errors
+                 ]
+                 logger.error(
+                     f"Batch upload failed - {e} - with following errors: {sanitized_errors}"
+                 )
+                 raise e
+             except Exception as e:
+                 logger.error(f"Batch upload failed - {e}")
+                 raise e
+
+     def _sanitize_bulk_index_error(self, error: dict[str, dict]) -> dict:
+         """Remove data uploaded to index from the log, leave only error information.
+
+         Error structure is `{<operation-type>: {..., "data": <uploaded-object>}}`
+         """
+         for error_data in error.values():
+             error_data.pop("data", None)
+         return error
  
  
  elasticsearch_source_entry = SourceRegistryEntry(
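
Two things change in this uploader: document IDs are now derived deterministically via get_enhanced_element_id instead of a random uuid4, and the parallel_bulk generator is drained with collections.deque(..., maxlen=0) so that indexing failures surface as a raised BulkIndexError rather than per-item log lines. A self-contained sketch of the draining idiom itself, independent of Elasticsearch:

import collections

def noisy_items():
    # Stand-in for parallel_bulk: a generator whose side effects are what matter.
    for i in range(3):
        print(f"processing item {i}")
        yield i

# deque with maxlen=0 consumes every item while storing none of them (O(1) memory).
collections.deque(noisy_items(), maxlen=0)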
unstructured_ingest/v2/processes/connectors/fsspec/azure.py
@@ -110,10 +110,14 @@ class AzureIndexer(FsspecIndexer):
      def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
          path = file_data["name"]
          date_created = (
-             file_data.get("creation_time").timestamp() if "creation_time" in file_data else None
+             str(file_data.get("creation_time").timestamp())
+             if "creation_time" in file_data
+             else None
          )
          date_modified = (
-             file_data.get("last_modified").timestamp() if "last_modified" in file_data else None
+             str(file_data.get("last_modified").timestamp())
+             if "last_modified" in file_data
+             else None
          )
  
          file_size = file_data.get("size") if "size" in file_data else None
unstructured_ingest/v2/processes/connectors/fsspec/box.py
@@ -3,10 +3,11 @@ from __future__ import annotations
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
- from typing import Any, Generator, Optional
+ from typing import Annotated, Any, Generator, Optional
  
  from dateutil import parser
  from pydantic import Field, Secret
+ from pydantic.functional_validators import BeforeValidator
  
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
@@ -23,7 +24,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
      FsspecIndexerConfig,
      FsspecUploader,
      FsspecUploaderConfig,
+     SourceConnectionError,
  )
+ from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
  
  CONNECTOR_TYPE = "box"
  
@@ -33,26 +36,35 @@ class BoxIndexerConfig(FsspecIndexerConfig):
  
  
  class BoxAccessConfig(FsspecAccessConfig):
-     box_app_config: Optional[str] = Field(
-         default=None, description="Path to Box app credentials as json file."
+     box_app_config: Annotated[dict, BeforeValidator(conform_string_to_dict)] = Field(
+         description="Box app credentials as a JSON string."
      )
  
  
  class BoxConnectionConfig(FsspecConnectionConfig):
      supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
-     access_config: Secret[BoxAccessConfig] = Field(default=BoxAccessConfig(), validate_default=True)
+     access_config: Secret[BoxAccessConfig]
      connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
  
      def get_access_config(self) -> dict[str, Any]:
-         # Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
-         # because it is not serializable.
          from boxsdk import JWTAuth
  
          ac = self.access_config.get_secret_value()
+         settings_dict = ac.box_app_config
+
+         # Create and authenticate the JWTAuth object
+         oauth = JWTAuth.from_settings_dictionary(settings_dict)
+         try:
+             oauth.authenticate_instance()
+         except Exception as e:
+             raise SourceConnectionError(f"Failed to authenticate with Box: {e}")
+
+         if not oauth.access_token:
+             raise SourceConnectionError("Authentication failed: No access token generated.")
+
+         # Prepare the access configuration with the authenticated oauth
          access_kwargs_with_oauth: dict[str, Any] = {
-             "oauth": JWTAuth.from_settings_file(
-                 ac.box_app_config,
-             ),
+             "oauth": oauth,
          }
          access_config: dict[str, Any] = ac.model_dump()
          access_config.pop("box_app_config", None)
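
box_app_config is now validated as a dict, with a BeforeValidator (conform_string_to_dict, whose implementation is not shown in this diff) presumably coercing JSON strings before field validation, and the JWT credentials are authenticated eagerly in get_access_config. An illustrative sketch of the Annotated/BeforeValidator pattern with a stand-in coercion function:

import json
from typing import Annotated, Any

from pydantic import BaseModel
from pydantic.functional_validators import BeforeValidator

def to_dict(value: Any) -> Any:
    # Stand-in for conform_string_to_dict: accept either a dict or a JSON string.
    return json.loads(value) if isinstance(value, str) else value

class ExampleAccessConfig(BaseModel):
    box_app_config: Annotated[dict, BeforeValidator(to_dict)]

print(ExampleAccessConfig(box_app_config='{"boxAppSettings": {"clientID": "abc"}}').box_app_config)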
unstructured_ingest/v2/processes/connectors/fsspec/box.py (continued)
@@ -80,9 +92,9 @@ class BoxIndexer(FsspecIndexer):
          date_created = None
          date_modified = None
          if modified_at_str := file_data.get("modified_at"):
-             date_modified = parser.parse(modified_at_str).timestamp()
+             date_modified = str(parser.parse(modified_at_str).timestamp())
          if created_at_str := file_data.get("created_at"):
-             date_created = parser.parse(created_at_str).timestamp()
+             date_created = str(parser.parse(created_at_str).timestamp())
  
          file_size = file_data.get("size") if "size" in file_data else None
  
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -297,7 +297,7 @@ class FsspecUploader(Uploader):
                  **self.connection_config.get_access_config(),
              )
              upload_path = Path(self.upload_config.path_without_protocol) / "_empty"
-             fs.write_bytes(path=str(upload_path), value=b"")
+             fs.write_bytes(path=upload_path.as_posix(), value=b"")
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -314,11 +314,11 @@
          path_str = str(path.resolve())
          upload_path = self.get_upload_path(file_data=file_data)
          logger.debug(f"writing local file {path_str} to {upload_path}")
-         self.fs.upload(lpath=path_str, rpath=str(upload_path))
+         self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
  
      async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
          upload_path = self.get_upload_path(file_data=file_data)
          path_str = str(path.resolve())
          # Odd that fsspec doesn't run exists() as async even when client support async
          logger.debug(f"writing local file {path_str} to {upload_path}")
-         self.fs.upload(lpath=path_str, rpath=str(upload_path))
+         self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
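
The switch from str(upload_path) to upload_path.as_posix() matters on Windows, where str() keeps backslashes that remote object-store paths should not contain. A quick illustration using PureWindowsPath so it reproduces on any OS:

from pathlib import PureWindowsPath

p = PureWindowsPath("bucket/output") / "file.json"
print(str(p))        # bucket\output\file.json  (backslashes, as str() would give on Windows)
print(p.as_posix())  # bucket/output/file.json  (what the remote filesystem expects)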
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py
@@ -113,9 +113,9 @@ class GcsIndexer(FsspecIndexer):
          date_created = None
          date_modified = None
          if modified_at_str := file_data.get("updated"):
-             date_modified = parser.parse(modified_at_str).timestamp()
+             date_modified = str(parser.parse(modified_at_str).timestamp())
          if created_at_str := file_data.get("timeCreated"):
-             date_created = parser.parse(created_at_str).timestamp()
+             date_created = str(parser.parse(created_at_str).timestamp())
  
          file_size = file_data.get("size") if "size" in file_data else None
  
@@ -30,7 +30,6 @@ CONNECTOR_TYPE = "sftp"
30
30
 
31
31
 
32
32
  class SftpIndexerConfig(FsspecIndexerConfig):
33
-
34
33
  def model_post_init(self, __context: Any) -> None:
35
34
  super().model_post_init(__context)
36
35
  _, ext = os.path.splitext(self.remote_url)
@@ -99,8 +98,8 @@ class SftpIndexer(FsspecIndexer):
99
98
 
100
99
  def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
101
100
  path = file_data["name"]
102
- date_created = file_data.get("time").timestamp() if "time" in file_data else None
103
- date_modified = file_data.get("mtime").timestamp() if "mtime" in file_data else None
101
+ date_created = str(file_data.get("time").timestamp()) if "time" in file_data else None
102
+ date_modified = str(file_data.get("mtime").timestamp()) if "mtime" in file_data else None
104
103
 
105
104
  file_size = file_data.get("size") if "size" in file_data else None
106
105
 
unstructured_ingest/v2/processes/connectors/kafka/cloud.py
@@ -1,6 +1,6 @@
  import socket
  from dataclasses import dataclass
- from typing import TYPE_CHECKING, Optional
+ from typing import TYPE_CHECKING
  
  from pydantic import Field, Secret, SecretStr
  
@@ -26,10 +26,10 @@ CONNECTOR_TYPE = "kafka-cloud"
  
  
  class CloudKafkaAccessConfig(KafkaAccessConfig):
-     api_key: Optional[SecretStr] = Field(
-         description="Kafka API key to connect at the server", alias="kafka_api_key", default=None
+     kafka_api_key: SecretStr = Field(
+         description="Kafka API key to connect at the server", default=None
      )
-     secret: Optional[SecretStr] = Field(description="", default=None)
+     secret: SecretStr = Field(description="", default=None)
  
  
  class CloudKafkaConnectionConfig(KafkaConnectionConfig):
@@ -43,11 +43,11 @@ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
          conf = {
              "bootstrap.servers": f"{bootstrap}:{port}",
              "client.id": socket.gethostname(),
-             "group.id": "default_group_id",
+             "group.id": self.group_id,
              "enable.auto.commit": "false",
              "auto.offset.reset": "earliest",
-             "sasl.username": access_config.api_key,
-             "sasl.password": access_config.secret,
+             "sasl.username": access_config.kafka_api_key.get_secret_value(),
+             "sasl.password": access_config.secret.get_secret_value(),
              "sasl.mechanism": "PLAIN",
              "security.protocol": "SASL_SSL",
          }
@@ -61,7 +61,7 @@ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
  
          conf = {
              "bootstrap.servers": f"{bootstrap}:{port}",
-             "sasl.username": access_config.api_key,
+             "sasl.username": access_config.kafka_api_key,
              "sasl.password": access_config.secret,
              "sasl.mechanism": "PLAIN",
              "security.protocol": "SASL_SSL",
unstructured_ingest/v2/processes/connectors/kafka/kafka.py
@@ -43,6 +43,11 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
      access_config: Secret[KafkaAccessConfig]
      bootstrap_server: str
      port: int
+     group_id: str = Field(
+         description="A consumer group is a way to allow a pool of consumers "
+         "to divide the consumption of data over topics and partitions.",
+         default="default_group_id",
+     )
  
      @abstractmethod
      def get_consumer_configuration(self) -> dict:
@@ -75,7 +80,7 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
  class KafkaIndexerConfig(IndexerConfig):
      topic: str = Field(description="which topic to consume from")
      num_messages_to_consume: Optional[int] = 100
-     timeout: Optional[float] = Field(default=1.0, description="polling timeout")
+     timeout: Optional[float] = Field(default=3.0, description="polling timeout", ge=3.0)
  
      def update_consumer(self, consumer: "Consumer") -> None:
          consumer.subscribe([self.topic])
@@ -157,7 +162,9 @@ class KafkaIndexer(Indexer, ABC):
      def precheck(self):
          try:
              with self.get_consumer() as consumer:
-                 cluster_meta = consumer.list_topics(timeout=self.index_config.timeout)
+                 # timeout needs at least 3 secs, more info:
+                 # https://forum.confluent.io/t/kafkacat-connect-failure-to-confcloud-ssl/2513
+                 cluster_meta = consumer.list_topics(timeout=5)
                  current_topics = [
                      topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
                  ]
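
group_id is now a configurable connection field (defaulting to "default_group_id") that flows into the consumer's group.id, replacing the value previously hard-coded in both the cloud and local configs. A minimal sketch of the resulting confluent-kafka consumer configuration, with placeholder host, group id, and topic:

from confluent_kafka import Consumer

conf = {
    "bootstrap.servers": "localhost:29092",  # placeholder bootstrap_server:port
    "group.id": "my_ingest_group",           # was hard-coded to "default_group_id" before 0.3.7
    "enable.auto.commit": "false",
    "auto.offset.reset": "earliest",
}
consumer = Consumer(conf)
consumer.subscribe(["ingest-topic"])  # placeholder topic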
unstructured_ingest/v2/processes/connectors/kafka/local.py
@@ -39,7 +39,7 @@ class LocalKafkaConnectionConfig(KafkaConnectionConfig):
  
          conf = {
              "bootstrap.servers": f"{bootstrap}:{port}",
-             "group.id": "default_group_id",
+             "group.id": self.group_id,
              "enable.auto.commit": "false",
              "auto.offset.reset": "earliest",
          }