unstructured-ingest 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/test_duckdb.py +82 -0
- test/integration/connectors/duckdb/test_motherduck.py +106 -0
- test/integration/connectors/test_kafka.py +109 -6
- test/integration/connectors/test_qdrant.py +55 -0
- test/unit/v2/connectors/test_confluence.py +39 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +1 -0
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +24 -21
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -5
- unstructured_ingest/v2/processes/connectors/confluence.py +14 -2
- unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +99 -0
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +118 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +133 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +34 -15
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -2
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +2 -3
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +8 -8
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +9 -2
- unstructured_ingest/v2/processes/connectors/kafka/local.py +1 -1
- unstructured_ingest/v2/processes/connectors/kdbai.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +6 -4
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +7 -9
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +62 -24
- unstructured_ingest/v2/processes/connectors/sql/sql.py +8 -3
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +6 -9
- unstructured_ingest/v2/utils.py +9 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/METADATA +19 -17
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/RECORD +39 -31
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.6.dist-info → unstructured_ingest-0.3.7.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -0,0 +1,99 @@
+import json
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+
+from unstructured_ingest.v2.interfaces import FileData, UploadStager
+
+_COLUMNS = (
+    "id",
+    "element_id",
+    "text",
+    "embeddings",
+    "type",
+    "system",
+    "layout_width",
+    "layout_height",
+    "points",
+    "url",
+    "version",
+    "date_created",
+    "date_modified",
+    "date_processed",
+    "permissions_data",
+    "record_locator",
+    "category_depth",
+    "parent_id",
+    "attached_filename",
+    "filetype",
+    "last_modified",
+    "file_directory",
+    "filename",
+    "languages",
+    "page_number",
+    "links",
+    "page_name",
+    "link_urls",
+    "link_texts",
+    "sent_from",
+    "sent_to",
+    "subject",
+    "section",
+    "header_footer_type",
+    "emphasized_text_contents",
+    "emphasized_text_tags",
+    "text_as_html",
+    "regex_metadata",
+    "detection_class_prob",
+)
+
+# _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
+
+
+@dataclass
+class BaseDuckDBUploadStager(UploadStager):
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents: list[dict] = json.load(elements_file)
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        output = []
+        for data in elements_contents:
+            metadata: dict[str, Any] = data.pop("metadata", {})
+            data_source = metadata.pop("data_source", {})
+            coordinates = metadata.pop("coordinates", {})
+
+            data.update(metadata)
+            data.update(data_source)
+            data.update(coordinates)
+
+            data["id"] = str(uuid.uuid4())
+
+            # remove extraneous, not supported columns
+            data = {k: v for k, v in data.items() if k in _COLUMNS}
+
+            output.append(data)
+
+        df = pd.DataFrame.from_dict(output)
+
+        for column in filter(
+            lambda x: x in df.columns,
+            ("version", "page_number", "regex_metadata"),
+        ):
+            df[column] = df[column].apply(str)
+
+        with output_path.open("w") as output_file:
+            df.to_json(output_file, orient="records", lines=True)
+        return output_path
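For orientation, here is a minimal sketch of what the stager's run() loop does to a single element before the rows are written out as line-delimited JSON. The sample element dict and the trimmed column tuple are illustrative only, not data from the package.

import uuid

SUPPORTED_COLUMNS = ("id", "element_id", "text", "type", "filename", "page_number", "url")

element = {
    "element_id": "abc123",
    "text": "Hello world",
    "type": "NarrativeText",
    "metadata": {
        "filename": "report.pdf",
        "page_number": 1,
        "data_source": {"url": "s3://bucket/report.pdf"},
        "coordinates": {"points": [[0.0, 0.0]]},
    },
}

# Flatten metadata, data_source and coordinates into the top-level dict, as run() does.
metadata = element.pop("metadata", {})
data_source = metadata.pop("data_source", {})
coordinates = metadata.pop("coordinates", {})
element.update(metadata)
element.update(data_source)
element.update(coordinates)

# Assign a fresh row id, then drop keys that are not in the supported column set.
element["id"] = str(uuid.uuid4())
row = {k: v for k, v in element.items() if k in SUPPORTED_COLUMNS}
print(row)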
unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py
@@ -0,0 +1,118 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Optional
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    Uploader,
+    UploaderConfig,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
+
+if TYPE_CHECKING:
+    from duckdb import DuckDBPyConnection as DuckDBConnection
+
+CONNECTOR_TYPE = "duckdb"
+
+
+class DuckDBAccessConfig(AccessConfig):
+    pass
+
+
+class DuckDBConnectionConfig(ConnectionConfig):
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    database: Optional[str] = Field(
+        default=None,
+        description="Database name. Path to the DuckDB .db file. If the file does "
+        "not exist, it will be created at the specified path.",
+    )
+    db_schema: Optional[str] = Field(
+        default="main",
+        description="Schema name. Schema in the database where the elements table is located.",
+    )
+    table: Optional[str] = Field(
+        default="elements",
+        description="Table name. Table name into which the elements data is inserted.",
+    )
+    access_config: Secret[DuckDBAccessConfig] = Field(
+        default=DuckDBAccessConfig(), validate_default=True
+    )
+
+    def __post_init__(self):
+        if self.database is None:
+            raise ValueError(
+                "A DuckDB connection requires a path to a *.db or *.duckdb file "
+                "through the `database` argument"
+            )
+
+
+class DuckDBUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class DuckDBUploadStager(BaseDuckDBUploadStager):
+    upload_stager_config: DuckDBUploadStagerConfig = field(
+        default_factory=lambda: DuckDBUploadStagerConfig()
+    )
+
+
+class DuckDBUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="[Not-used] Number of records per batch")
+
+
+@dataclass
+class DuckDBUploader(Uploader):
+    connector_type: str = CONNECTOR_TYPE
+    upload_config: DuckDBUploaderConfig
+    connection_config: DuckDBConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            cursor = self.connection().cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @property
+    def connection(self) -> Callable[[], "DuckDBConnection"]:
+        return self._make_duckdb_connection
+
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    def _make_duckdb_connection(self) -> "DuckDBConnection":
+        import duckdb
+
+        return duckdb.connect(self.connection_config.database)
+
+    def upload_contents(self, path: Path) -> None:
+        df_elements = pd.read_json(path, orient="records", lines=True)
+        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
+
+        with self.connection() as conn:
+            conn.query(
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+            )
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload_contents(path=path)
+
+
+duckdb_destination_entry = DestinationRegistryEntry(
+    connection_config=DuckDBConnectionConfig,
+    uploader=DuckDBUploader,
+    uploader_config=DuckDBUploaderConfig,
+    upload_stager=DuckDBUploadStager,
+    upload_stager_config=DuckDBUploadStagerConfig,
+)
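As a usage note, a minimal sketch of wiring the new DuckDB destination by hand. It assumes the duckdb extra is installed, that the target schema.table already exists in the database file (the uploader only runs an INSERT), and that a staged .json file produced by the upload stager is available; the paths and file names below are hypothetical.

from pathlib import Path

from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
    DuckDBConnectionConfig,
    DuckDBUploader,
    DuckDBUploaderConfig,
)

# Hypothetical database path; DuckDB creates the file if missing, but the
# elements table itself must already exist for the INSERT to succeed.
connection_config = DuckDBConnectionConfig(database="/tmp/elements.db")
uploader = DuckDBUploader(
    connection_config=connection_config,
    upload_config=DuckDBUploaderConfig(),
)

uploader.precheck()  # opens a cursor and runs SELECT 1 to validate the connection
uploader.upload_contents(path=Path("staged-output.json"))  # hypothetical staged file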
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py
@@ -0,0 +1,133 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Optional
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    Uploader,
+    UploaderConfig,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUploadStager
+
+if TYPE_CHECKING:
+    from duckdb import DuckDBPyConnection as MotherDuckConnection
+
+CONNECTOR_TYPE = "motherduck"
+
+
+class MotherDuckAccessConfig(AccessConfig):
+    md_token: Optional[str] = Field(default=None, description="MotherDuck token")
+
+
+class MotherDuckConnectionConfig(ConnectionConfig):
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    database: Optional[str] = Field(
+        default=None,
+        description="Database name. Name of the MotherDuck database.",
+    )
+    db_schema: Optional[str] = Field(
+        default="main",
+        description="Schema name. Schema in the database where the elements table is located.",
+    )
+    table: Optional[str] = Field(
+        default="elements",
+        description="Table name. Table name into which the elements data is inserted.",
+    )
+    access_config: Secret[MotherDuckAccessConfig] = Field(
+        default=MotherDuckAccessConfig(), validate_default=True
+    )
+
+    def __post_init__(self):
+        if self.database is None:
+            raise ValueError(
+                "A MotherDuck connection requires a database (string) to be passed "
+                "through the `database` argument"
+            )
+        if self.access_config.md_token is None:
+            raise ValueError(
+                "A MotherDuck connection requires a md_token (MotherDuck token) to be passed "
+                "using MotherDuckAccessConfig through the `access_config` argument"
+            )
+
+
+class MotherDuckUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class MotherDuckUploadStager(BaseDuckDBUploadStager):
+    upload_stager_config: MotherDuckUploadStagerConfig = field(
+        default_factory=lambda: MotherDuckUploadStagerConfig()
+    )
+
+
+class MotherDuckUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="[Not-used] Number of records per batch")
+
+
+@dataclass
+class MotherDuckUploader(Uploader):
+    connector_type: str = CONNECTOR_TYPE
+    upload_config: MotherDuckUploaderConfig
+    connection_config: MotherDuckConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            cursor = self.connection().cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @property
+    def connection(self) -> Callable[[], "MotherDuckConnection"]:
+        return self._make_motherduck_connection
+
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    def _make_motherduck_connection(self) -> "MotherDuckConnection":
+        import duckdb
+
+        access_config = self.connection_config.access_config.get_secret_value()
+        conn = duckdb.connect(
+            f"md:?motherduck_token={access_config.md_token}",
+            config={
+                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
+            },
+        )
+
+        conn.sql(f"USE {self.connection_config.database}")
+
+        return conn
+
+    def upload_contents(self, path: Path) -> None:
+        df_elements = pd.read_json(path, orient="records", lines=True)
+        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
+
+        with self.connection() as conn:
+            conn.query(
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+            )
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload_contents(path=path)
+
+
+motherduck_destination_entry = DestinationRegistryEntry(
+    connection_config=MotherDuckConnectionConfig,
+    uploader=MotherDuckUploader,
+    uploader_config=MotherDuckUploaderConfig,
+    upload_stager=MotherDuckUploadStager,
+    upload_stager_config=MotherDuckUploadStagerConfig,
+)
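The MotherDuck variant is wired the same way; the only material difference is that a token must be supplied via MotherDuckAccessConfig. A minimal sketch, with placeholder database name and token:

from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
    MotherDuckAccessConfig,
    MotherDuckConnectionConfig,
    MotherDuckUploader,
    MotherDuckUploaderConfig,
)

connection_config = MotherDuckConnectionConfig(
    database="my_db",  # placeholder MotherDuck database name
    access_config=MotherDuckAccessConfig(md_token="<motherduck-token>"),  # placeholder token
)
uploader = MotherDuckUploader(
    connection_config=connection_config,
    upload_config=MotherDuckUploaderConfig(),
)
uploader.precheck()  # connects via md:?motherduck_token=... and runs SELECT 1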
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py
@@ -1,7 +1,7 @@
+import collections
 import hashlib
 import json
 import sys
-import uuid
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -41,6 +41,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
     from elasticsearch import Elasticsearch as ElasticsearchClient
@@ -326,7 +327,7 @@ class ElasticsearchUploadStager(UploadStager):
     def conform_dict(self, data: dict, file_data: FileData) -> dict:
         resp = {
             "_index": self.upload_stager_config.index_name,
-            "_id":
+            "_id": get_enhanced_element_id(element_dict=data, file_data=file_data),
             "_source": {
                 "element_id": data.pop("element_id", None),
                 "embeddings": data.pop("embeddings", None),
@@ -425,7 +426,10 @@ class ElasticsearchUploader(Uploader):
         if failures := delete_resp.get("failures"):
             raise WriteError(f"failed to delete records: {failures}")
 
-
+    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:  # type: ignore
+        from elasticsearch.helpers.errors import BulkIndexError
+
         parallel_bulk = self.load_parallel_bulk()
         with path.open("r") as file:
             elements_dict = json.load(file)
@@ -449,18 +453,33 @@
         for batch in generator_batching_wbytes(
             elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
         ):
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                iterator = parallel_bulk(
+                    client=client,
+                    actions=batch,
+                    thread_count=self.upload_config.num_threads,
+                )
+                collections.deque(iterator, maxlen=0)
+            except BulkIndexError as e:
+                sanitized_errors = [
+                    self._sanitize_bulk_index_error(error) for error in e.errors
+                ]
+                logger.error(
+                    f"Batch upload failed - {e} - with following errors: {sanitized_errors}"
+                )
+                raise e
+            except Exception as e:
+                logger.error(f"Batch upload failed - {e}")
+                raise e
+
+    def _sanitize_bulk_index_error(self, error: dict[str, dict]) -> dict:
+        """Remove data uploaded to index from the log, leave only error information.
+
+        Error structure is `{<operation-type>: {..., "data": <uploaded-object>}}`
+        """
+        for error_data in error.values():
+            error_data.pop("data", None)
+        return error
 
 
 elasticsearch_source_entry = SourceRegistryEntry(
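A side note on the collections.deque(iterator, maxlen=0) call in the new run() body: parallel_bulk returns a generator of (success, info) tuples, and a zero-length deque is simply a cheap way to drive it to completion without storing any results. The toy generator below is only for illustration:

import collections


def fake_parallel_bulk(actions):
    # Stand-in generator; the real parallel_bulk yields (ok, result) tuples per action.
    for action in actions:
        yield True, action


# Exhausts the generator, keeps nothing in memory.
collections.deque(fake_parallel_bulk(["a", "b", "c"]), maxlen=0)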
unstructured_ingest/v2/processes/connectors/fsspec/azure.py
@@ -110,10 +110,14 @@ class AzureIndexer(FsspecIndexer):
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = (
-            file_data.get("creation_time").timestamp()
+            str(file_data.get("creation_time").timestamp())
+            if "creation_time" in file_data
+            else None
         )
         date_modified = (
-            file_data.get("last_modified").timestamp()
+            str(file_data.get("last_modified").timestamp())
+            if "last_modified" in file_data
+            else None
         )
 
         file_size = file_data.get("size") if "size" in file_data else None
unstructured_ingest/v2/processes/connectors/fsspec/box.py
@@ -3,10 +3,11 @@ from __future__ import annotations
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import Annotated, Any, Generator, Optional
 
 from dateutil import parser
 from pydantic import Field, Secret
+from pydantic.functional_validators import BeforeValidator
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
@@ -23,7 +24,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecIndexerConfig,
     FsspecUploader,
     FsspecUploaderConfig,
+    SourceConnectionError,
 )
+from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
 
 CONNECTOR_TYPE = "box"
 
@@ -33,26 +36,35 @@ class BoxIndexerConfig(FsspecIndexerConfig):
 
 
 class BoxAccessConfig(FsspecAccessConfig):
-    box_app_config:
-
+    box_app_config: Annotated[dict, BeforeValidator(conform_string_to_dict)] = Field(
+        description="Box app credentials as a JSON string."
     )
 
 
 class BoxConnectionConfig(FsspecConnectionConfig):
     supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
-    access_config: Secret[BoxAccessConfig]
+    access_config: Secret[BoxAccessConfig]
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     def get_access_config(self) -> dict[str, Any]:
-        # Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
-        # because it is not serializable.
         from boxsdk import JWTAuth
 
         ac = self.access_config.get_secret_value()
+        settings_dict = ac.box_app_config
+
+        # Create and authenticate the JWTAuth object
+        oauth = JWTAuth.from_settings_dictionary(settings_dict)
+        try:
+            oauth.authenticate_instance()
+        except Exception as e:
+            raise SourceConnectionError(f"Failed to authenticate with Box: {e}")
+
+        if not oauth.access_token:
+            raise SourceConnectionError("Authentication failed: No access token generated.")
+
+        # Prepare the access configuration with the authenticated oauth
         access_kwargs_with_oauth: dict[str, Any] = {
-            "oauth":
-                ac.box_app_config,
-            ),
+            "oauth": oauth,
         }
         access_config: dict[str, Any] = ac.model_dump()
         access_config.pop("box_app_config", None)
@@ -80,9 +92,9 @@ class BoxIndexer(FsspecIndexer):
         date_created = None
         date_modified = None
         if modified_at_str := file_data.get("modified_at"):
-            date_modified = parser.parse(modified_at_str).timestamp()
+            date_modified = str(parser.parse(modified_at_str).timestamp())
         if created_at_str := file_data.get("created_at"):
-            date_created = parser.parse(created_at_str).timestamp()
+            date_created = str(parser.parse(created_at_str).timestamp())
 
         file_size = file_data.get("size") if "size" in file_data else None
 
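For context on the box_app_config change: Annotated[dict, BeforeValidator(conform_string_to_dict)] lets callers pass the Box app credentials either as a dict or as a JSON string, with the validator normalizing the input before pydantic checks the dict type. The sketch below uses a stand-in json.loads-based parser; the package's actual conform_string_to_dict helper is not shown in this diff, so its exact behavior is assumed.

import json
from typing import Annotated, Union

from pydantic import BaseModel, Field
from pydantic.functional_validators import BeforeValidator


def parse_json_if_string(value: Union[str, dict]) -> dict:
    # Stand-in for conform_string_to_dict: accept a dict or a JSON string, return a dict.
    return json.loads(value) if isinstance(value, str) else value


class ExampleAccessConfig(BaseModel):
    box_app_config: Annotated[dict, BeforeValidator(parse_json_if_string)] = Field(
        description="Box app credentials as a JSON string."
    )


print(ExampleAccessConfig(box_app_config='{"boxAppSettings": {"clientID": "x"}}').box_app_config)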
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -297,7 +297,7 @@ class FsspecUploader(Uploader):
                 **self.connection_config.get_access_config(),
             )
             upload_path = Path(self.upload_config.path_without_protocol) / "_empty"
-            fs.write_bytes(path=
+            fs.write_bytes(path=upload_path.as_posix(), value=b"")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -314,11 +314,11 @@ class FsspecUploader(Uploader):
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         logger.debug(f"writing local file {path_str} to {upload_path}")
-        self.fs.upload(lpath=path_str, rpath=
+        self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
         # Odd that fsspec doesn't run exists() as async even when client support async
         logger.debug(f"writing local file {path_str} to {upload_path}")
-        self.fs.upload(lpath=path_str, rpath=
+        self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py
@@ -113,9 +113,9 @@ class GcsIndexer(FsspecIndexer):
         date_created = None
         date_modified = None
         if modified_at_str := file_data.get("updated"):
-            date_modified = parser.parse(modified_at_str).timestamp()
+            date_modified = str(parser.parse(modified_at_str).timestamp())
         if created_at_str := file_data.get("timeCreated"):
-            date_created = parser.parse(created_at_str).timestamp()
+            date_created = str(parser.parse(created_at_str).timestamp())
 
         file_size = file_data.get("size") if "size" in file_data else None
 
@@ -30,7 +30,6 @@ CONNECTOR_TYPE = "sftp"
|
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
class SftpIndexerConfig(FsspecIndexerConfig):
|
|
33
|
-
|
|
34
33
|
def model_post_init(self, __context: Any) -> None:
|
|
35
34
|
super().model_post_init(__context)
|
|
36
35
|
_, ext = os.path.splitext(self.remote_url)
|
|
@@ -99,8 +98,8 @@ class SftpIndexer(FsspecIndexer):
|
|
|
99
98
|
|
|
100
99
|
def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
|
|
101
100
|
path = file_data["name"]
|
|
102
|
-
date_created = file_data.get("time").timestamp() if "time" in file_data else None
|
|
103
|
-
date_modified = file_data.get("mtime").timestamp() if "mtime" in file_data else None
|
|
101
|
+
date_created = str(file_data.get("time").timestamp()) if "time" in file_data else None
|
|
102
|
+
date_modified = str(file_data.get("mtime").timestamp()) if "mtime" in file_data else None
|
|
104
103
|
|
|
105
104
|
file_size = file_data.get("size") if "size" in file_data else None
|
|
106
105
|
|
|
unstructured_ingest/v2/processes/connectors/kafka/cloud.py
@@ -1,6 +1,6 @@
 import socket
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 from pydantic import Field, Secret, SecretStr
 
@@ -26,10 +26,10 @@ CONNECTOR_TYPE = "kafka-cloud"
 
 
 class CloudKafkaAccessConfig(KafkaAccessConfig):
-
-        description="Kafka API key to connect at the server",
+    kafka_api_key: SecretStr = Field(
+        description="Kafka API key to connect at the server", default=None
     )
-    secret:
+    secret: SecretStr = Field(description="", default=None)
 
 
 class CloudKafkaConnectionConfig(KafkaConnectionConfig):
@@ -43,11 +43,11 @@ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
         conf = {
             "bootstrap.servers": f"{bootstrap}:{port}",
             "client.id": socket.gethostname(),
-            "group.id":
+            "group.id": self.group_id,
             "enable.auto.commit": "false",
             "auto.offset.reset": "earliest",
-            "sasl.username": access_config.
-            "sasl.password": access_config.secret,
+            "sasl.username": access_config.kafka_api_key.get_secret_value(),
+            "sasl.password": access_config.secret.get_secret_value(),
             "sasl.mechanism": "PLAIN",
             "security.protocol": "SASL_SSL",
         }
@@ -61,7 +61,7 @@ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
 
         conf = {
             "bootstrap.servers": f"{bootstrap}:{port}",
-            "sasl.username": access_config.
+            "sasl.username": access_config.kafka_api_key,
             "sasl.password": access_config.secret,
             "sasl.mechanism": "PLAIN",
             "security.protocol": "SASL_SSL",
unstructured_ingest/v2/processes/connectors/kafka/kafka.py
@@ -43,6 +43,11 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
     access_config: Secret[KafkaAccessConfig]
     bootstrap_server: str
     port: int
+    group_id: str = Field(
+        description="A consumer group is a way to allow a pool of consumers "
+        "to divide the consumption of data over topics and partitions.",
+        default="default_group_id",
+    )
 
     @abstractmethod
     def get_consumer_configuration(self) -> dict:
@@ -75,7 +80,7 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
 class KafkaIndexerConfig(IndexerConfig):
     topic: str = Field(description="which topic to consume from")
     num_messages_to_consume: Optional[int] = 100
-    timeout: Optional[float] = Field(default=
+    timeout: Optional[float] = Field(default=3.0, description="polling timeout", ge=3.0)
 
     def update_consumer(self, consumer: "Consumer") -> None:
         consumer.subscribe([self.topic])
@@ -157,7 +162,9 @@ class KafkaIndexer(Indexer, ABC):
     def precheck(self):
         try:
            with self.get_consumer() as consumer:
-
+                # timeout needs at least 3 secs, more info:
+                # https://forum.confluent.io/t/kafkacat-connect-failure-to-confcloud-ssl/2513
+                cluster_meta = consumer.list_topics(timeout=5)
                 current_topics = [
                     topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
                 ]
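A small sketch of the new Kafka config behavior: KafkaIndexerConfig now defaults its polling timeout to 3.0 seconds and rejects anything lower (ge=3.0), and KafkaConnectionConfig carries a group_id that defaults to "default_group_id". This assumes the module imports without the optional Kafka client installed (the Consumer annotation above is only a string); the topic name is a placeholder.

from pydantic import ValidationError

from unstructured_ingest.v2.processes.connectors.kafka.kafka import KafkaIndexerConfig

cfg = KafkaIndexerConfig(topic="ingest-topic")  # placeholder topic
print(cfg.timeout)  # 3.0 by default

try:
    KafkaIndexerConfig(topic="ingest-topic", timeout=1.0)
except ValidationError as err:
    # The ge=3.0 constraint surfaces as a greater_than_equal validation error.
    print(f"timeout below the 3-second floor is rejected: {err.errors()[0]['type']}")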