unstructured-ingest 1.0.55__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/azure_openai.py +7 -2
- unstructured_ingest/embed/openai.py +11 -4
- unstructured_ingest/interfaces/connector.py +7 -1
- unstructured_ingest/otel.py +16 -1
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +106 -14
- unstructured_ingest/processes/connectors/fsspec/s3.py +14 -4
- unstructured_ingest/processes/connectors/onedrive.py +9 -8
- unstructured_ingest/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/utils/__init__.py +1 -1
- unstructured_ingest/utils/html.py +1 -0
- unstructured_ingest/utils/tls.py +15 -0
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/METADATA +2 -1
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/RECORD +20 -17
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/licenses/LICENSE.md +0 -0
unstructured_ingest/__version__.py
CHANGED

@@ -1 +1 @@
-__version__ = "1.0.55"
+__version__ = "1.1.0" # pragma: no cover
unstructured_ingest/embed/azure_openai.py
CHANGED

@@ -9,6 +9,7 @@ from unstructured_ingest.embed.openai import (
     OpenAIEmbeddingEncoder,
 )
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override

 if TYPE_CHECKING:
     from openai import AsyncAzureOpenAI, AzureOpenAI
@@ -23,9 +24,11 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):

     @requires_dependencies(["openai"], extras="openai")
     def get_client(self) -> "AzureOpenAI":
-        from openai import AzureOpenAI
+        from openai import AzureOpenAI, DefaultHttpxClient

+        client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
         return AzureOpenAI(
+            http_client=client,
             api_key=self.api_key.get_secret_value(),
             api_version=self.api_version,
             azure_endpoint=self.azure_endpoint,
@@ -33,9 +36,11 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):

     @requires_dependencies(["openai"], extras="openai")
     def get_async_client(self) -> "AsyncAzureOpenAI":
-        from openai import AsyncAzureOpenAI
+        from openai import AsyncAzureOpenAI, DefaultAsyncHttpxClient

+        client = DefaultAsyncHttpxClient(verify=ssl_context_with_optional_ca_override())
         return AsyncAzureOpenAI(
+            http_client=client,
             api_key=self.api_key.get_secret_value(),
             api_version=self.api_version,
             azure_endpoint=self.azure_endpoint,
unstructured_ingest/embed/openai.py
CHANGED

@@ -18,6 +18,7 @@ from unstructured_ingest.errors_v2 import (
 )
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override

 if TYPE_CHECKING:
     from openai import AsyncOpenAI, OpenAI
@@ -86,15 +87,21 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):

     @requires_dependencies(["openai"], extras="openai")
     def get_client(self) -> "OpenAI":
-        from openai import OpenAI
+        from openai import DefaultHttpxClient, OpenAI

-
+        client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
+        return OpenAI(
+            api_key=self.api_key.get_secret_value(), http_client=client, base_url=self.base_url
+        )

     @requires_dependencies(["openai"], extras="openai")
     def get_async_client(self) -> "AsyncOpenAI":
-        from openai import AsyncOpenAI
+        from openai import AsyncOpenAI, DefaultAsyncHttpxClient

-
+        client = DefaultAsyncHttpxClient(verify=ssl_context_with_optional_ca_override())
+        return AsyncOpenAI(
+            api_key=self.api_key.get_secret_value(), http_client=client, base_url=self.base_url
+        )


 @dataclass
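Both embed modules follow the same pattern: build an httpx client whose TLS verification uses the new CA-override context and hand it to the OpenAI SDK. Below is a minimal stand-alone sketch of that wiring, assuming the openai, httpx, and certifi packages are installed; the API key is a placeholder, not a real value.

import os
import ssl

import certifi
from openai import DefaultHttpxClient, OpenAI

# Same idea as ssl_context_with_optional_ca_override: prefer REQUESTS_CA_BUNDLE, fall back to certifi.
ca_bundle = os.environ.get("REQUESTS_CA_BUNDLE", certifi.where())
ssl_context = ssl.create_default_context(cafile=ca_bundle)

client = OpenAI(
    api_key="sk-placeholder",  # placeholder value, not a real key
    http_client=DefaultHttpxClient(verify=ssl_context),  # route all SDK traffic through the custom context
)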
unstructured_ingest/interfaces/connector.py
CHANGED

@@ -5,6 +5,8 @@ from typing import Any, TypeVar, Union
 from pydantic import BaseModel, Secret, model_validator
 from pydantic.types import _SecretBase

+from unstructured_ingest.processes.utils.logging.connector import ConnectorLoggingMixin
+

 class AccessConfig(BaseModel):
     """Meant to designate holding any sensitive information associated with other configs
@@ -46,5 +48,9 @@ ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)


 @dataclass
-class BaseConnector(ABC):
+class BaseConnector(ABC, ConnectorLoggingMixin):
     connection_config: ConnectionConfigT
+
+    def __post_init__(self):
+        """Initialize the logging mixin after dataclass initialization."""
+        ConnectorLoggingMixin.__init__(self)
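Because BaseConnector is a dataclass, the generated __init__ never runs the mixin's cooperative __init__, which is why the hunk adds the explicit call in __post_init__. A simplified, self-contained illustration of that behaviour; the classes below are stand-ins, not the library's.

from dataclasses import dataclass


class LoggingMixinSketch:
    def __init__(self):
        # Stands in for the real mixin's _logging_config / _sanitizer setup.
        self.configured = True


@dataclass
class ConnectorSketch(LoggingMixinSketch):
    connection_config: str

    def __post_init__(self):
        # Without this explicit call, the attributes set in LoggingMixinSketch.__init__
        # would never exist on dataclass instances.
        LoggingMixinSketch.__init__(self)


assert ConnectorSketch(connection_config="cfg").configured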
unstructured_ingest/otel.py
CHANGED

@@ -1,3 +1,4 @@
+import logging
 import os
 from dataclasses import dataclass, field
 from typing import Callable, ClassVar, Optional, Protocol, Sequence
@@ -31,13 +32,27 @@ class LogSpanExporter(ConsoleSpanExporter):
         self.log_out(self.formatter(span))
         return SpanExportResult.SUCCESS

+def get_log_out() -> Callable:
+    level_names_mapping = {
+        'CRITICAL': logging.CRITICAL,
+        'FATAL': logging.FATAL,
+        'ERROR': logging.ERROR,
+        'WARN': logging.WARNING,
+        'WARNING': logging.WARNING,
+        'INFO': logging.INFO,
+        'DEBUG': logging.DEBUG,
+        'NOTSET': logging.NOTSET,
+    }
+    log_level = os.getenv("OTEL_LOG_LEVEL", "DEBUG").upper()
+    log_level_int = level_names_mapping.get(log_level, logging.DEBUG)
+    return lambda message: logger.log(log_level_int, message)

 @dataclass
 class OtelHandler:
     otel_endpoint: Optional[str] = None
     service_name: str = "unstructured-ingest"
     trace_provider: TracerProvider = field(init=False)
-    log_out: Callable = field(default=
+    log_out: Callable = field(default=get_log_out())
     trace_context_key: ClassVar[str] = "_trace_context"

     def init_trace(self):
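The new get_log_out helper reads OTEL_LOG_LEVEL when the field default is evaluated and falls back to DEBUG for unknown or unset values. A quick check of that lookup, re-implemented locally rather than imported from the package:

import logging
import os

# Same name-to-level table as get_log_out above (WARN and FATAL are stdlib aliases).
levels = {name: getattr(logging, name)
          for name in ("CRITICAL", "FATAL", "ERROR", "WARN", "WARNING", "INFO", "DEBUG", "NOTSET")}

os.environ["OTEL_LOG_LEVEL"] = "warning"
selected = levels.get(os.environ.get("OTEL_LOG_LEVEL", "DEBUG").upper(), logging.DEBUG)
assert selected == logging.WARNING

del os.environ["OTEL_LOG_LEVEL"]
assert levels.get(os.environ.get("OTEL_LOG_LEVEL", "DEBUG").upper(), logging.DEBUG) == logging.DEBUG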
unstructured_ingest/processes/connectors/fsspec/fsspec.py
CHANGED

@@ -28,7 +28,6 @@ from unstructured_ingest.interfaces import (
     Uploader,
     UploaderConfig,
 )
-from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
 from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe

@@ -106,6 +105,12 @@ class FsspecIndexer(Indexer):
     def precheck(self) -> None:
         from fsspec import get_filesystem_class

+        self.log_operation_start(
+            "Connection validation",
+            protocol=self.index_config.protocol,
+            path=self.index_config.path_without_protocol,
+        )
+
         try:
             fs = get_filesystem_class(self.index_config.protocol)(
                 **self.connection_config.get_access_config(),
@@ -113,13 +118,24 @@ class FsspecIndexer(Indexer):
             files = fs.ls(path=self.index_config.path_without_protocol, detail=True)
             valid_files = [x.get("name") for x in files if x.get("type") == "file"]
             if not valid_files:
+                self.log_operation_complete("Connection validation", count=0)
                 return
             file_to_sample = valid_files[0]
-
+            self.log_debug(f"attempting to make HEAD request for file: {file_to_sample}")
             with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                 client.head(path=file_to_sample)
+
+            self.log_connection_validated(
+                connector_type=self.connector_type,
+                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
+            )
+
         except Exception as e:
-
+            self.log_connection_failed(
+                connector_type=self.connector_type,
+                error=e,
+                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
+            )
             raise self.wrap_error(e=e)

     def get_file_info(self) -> list[dict[str, Any]]:
@@ -150,7 +166,7 @@ class FsspecIndexer(Indexer):

     def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
         if len(files) <= n:
-
+            self.log_warning(
                 f"number of files to be sampled={n} is not smaller than the number"
                 f" of files found ({len(files)}). Returning all of the files as the"
                 " sample."
@@ -201,9 +217,22 @@ class FsspecIndexer(Indexer):
         init_file_data.additional_metadata = self.get_metadata(file_info=file_info)

     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        self.log_indexing_start(f"{self.connector_type} files")
+
         files = self.get_file_info()
-
+        total_files = len(files)
+
+        self.log_operation_start("File indexing", total_files=total_files)
+
+        for i, file_info in enumerate(files):
             file_path = self.get_path(file_info=file_info)
+
+            # Only log progress for larger operations
+            if total_files > 5:
+                self.log_progress(
+                    current=i + 1, total=total_files, item_type="files", operation="Indexing"
+                )
+
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")
@@ -223,6 +252,8 @@ class FsspecIndexer(Indexer):
                 display_name=file_path,
             )

+        self.log_indexing_complete(f"{self.connector_type} files", total_files)
+

 class FsspecDownloaderConfig(DownloaderConfig):
     pass
@@ -272,25 +303,57 @@ class FsspecDownloader(Downloader):
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         mkdir_concurrent_safe(download_path.parent)
+
+        rpath = file_data.additional_metadata["original_file_path"]
+        file_size = file_data.metadata.filesize_bytes
+        self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)
+
         try:
-            rpath = file_data.additional_metadata["original_file_path"]
             with self.connection_config.get_client(protocol=self.protocol) as client:
                 client.get_file(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
+
         except Exception as e:
+            self.log_error(
+                "File download failed",
+                error=e,
+                context={"file_path": rpath, "file_id": file_data.identifier},
+            )
             raise self.wrap_error(e=e)
+
+        self.log_download_complete(
+            file_path=rpath,
+            file_id=file_data.identifier,
+            download_path=str(download_path),
+        )
+
         return self.generate_download_response(file_data=file_data, download_path=download_path)

     async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         mkdir_concurrent_safe(download_path.parent)
+        rpath = file_data.additional_metadata["original_file_path"]
+        file_size = file_data.metadata.filesize_bytes
+        self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)
+
         try:
-            rpath = file_data.additional_metadata["original_file_path"]
             with self.connection_config.get_client(protocol=self.protocol) as client:
                 await client.get_file(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
         except Exception as e:
+            self.log_error(
+                "File download failed",
+                error=e,
+                context={"file_path": rpath, "file_id": file_data.identifier},
+            )
             raise self.wrap_error(e=e)
+
+        self.log_download_complete(
+            file_path=rpath,
+            file_id=file_data.identifier,
+            download_path=str(download_path),
+        )
+
         return self.generate_download_response(file_data=file_data, download_path=download_path)


@@ -321,6 +384,7 @@ class FsspecUploader(Uploader):
         )

     def __post_init__(self):
+        super().__post_init__()
         # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
         if not self.upload_config:
             raise TypeError(
@@ -334,6 +398,8 @@ class FsspecUploader(Uploader):
     def precheck(self) -> None:
         from fsspec import get_filesystem_class

+        self.log_operation_start("Connection validation", protocol=self.upload_config.protocol)
+
         try:
             fs = get_filesystem_class(self.upload_config.protocol)(
                 **self.connection_config.get_access_config(),
@@ -341,7 +407,16 @@ class FsspecUploader(Uploader):
             upload_path = Path(self.upload_config.path_without_protocol) / "_empty"
             fs.write_bytes(path=upload_path.as_posix(), value=b"")
         except Exception as e:
+            self.log_connection_failed(
+                connector_type=self.connector_type,
+                error=e,
+                endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
+            )
             raise self.wrap_error(e=e)
+        self.log_connection_validated(
+            connector_type=self.connector_type,
+            endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
+        )

     def get_upload_path(self, file_data: FileData) -> Path:
         upload_path = Path(
@@ -353,14 +428,31 @@
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-
-
-
+        self.log_upload_start(file_path=path_str, destination=upload_path.as_posix())
+        try:
+            with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+                client.upload(lpath=path_str, rpath=upload_path.as_posix())
+        except Exception as e:
+            self.log_error(
+                "File upload failed",
+                error=e,
+                context={"file_path": path_str, "destination": upload_path.as_posix()},
+            )
+            raise self.wrap_error(e=e)
+        self.log_upload_complete(file_path=path_str, destination=upload_path.as_posix())

     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-
-
-
-
+        self.log_upload_start(file_path=path_str, destination=upload_path.as_posix())
+        try:
+            with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+                client.upload(lpath=path_str, rpath=upload_path.as_posix())
+        except Exception as e:
+            self.log_error(
+                "File upload failed",
+                error=e,
+                context={"file_path": path_str, "destination": upload_path.as_posix()},
+            )
+            raise self.wrap_error(e=e)
+        self.log_upload_complete(file_path=path_str, destination=upload_path.as_posix())
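Two thresholds govern how chatty the new indexer progress logging is: the indexer only calls log_progress when more than five files were listed, and log_progress itself only emits on multiples of LoggingConfig.log_progress_interval (default 10). A rough illustration with made-up numbers:

interval = 10      # LoggingConfig.log_progress_interval default
total_files = 42   # hypothetical listing size

emitted = [
    i for i in range(1, total_files + 1)
    if total_files > 5 and i % interval == 0
]
print(emitted)  # [10, 20, 30, 40] -> four "Indexing: N/42 files (..%)" log lines for 42 files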
unstructured_ingest/processes/connectors/fsspec/s3.py
CHANGED

@@ -104,7 +104,13 @@ class S3ConnectionConfig(FsspecConnectionConfig):
             return UserError(message)
         if http_code >= 500:
             return ProviderError(message)
-        logger.error(
+        logger.error(
+            "Unhandled exception from S3 (type: %s, endpoint: %s): %s",
+            type(e).__name__,
+            self.endpoint_url or "default",
+            e,
+            exc_info=True,
+        )
         return e

@@ -122,6 +128,10 @@ class S3Indexer(FsspecIndexer):

     def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
         path = file_info["Key"]
+
+        self.log_debug("Getting metadata for S3 object", context={"file_path": path})
+        self.log_file_operation("Getting metadata", file_path=path)
+
         date_created = None
         date_modified = None
         modified = file_info.get("LastModified")
@@ -147,9 +157,9 @@ class S3Indexer(FsspecIndexer):
             record_locator["metadata"] = metadata
         issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
         if issue_characters:
-
-                f"File path
-
+            self.log_warning(
+                f"File path contains characters that can cause issues with S3: {issue_characters}",
+                context={"path": path, "problematic_characters": issue_characters},
             )
         return FileDataSourceMetadata(
             date_created=date_created,
unstructured_ingest/processes/connectors/onedrive.py
CHANGED

@@ -115,23 +115,24 @@ class OnedriveConnectionConfig(ConnectionConfig):
         except ValueError as exc:
             logger.error("Couldn't set up credentials.")
             raise exc
-
+
         if "error" in token:
             error_codes = token.get("error_codes", [])
             error_type = token.get("error", "")
             error_description = token.get("error_description", "")
-
+
             # 7000215: Invalid client secret provided
             # 7000218: Invalid client id provided
             # 700016: Application not found in directory
             # 90002: Tenant not found
             auth_error_codes = [7000215, 7000218, 700016, 90002]
-
-            if
-
-
-
-
+
+            if any(code in error_codes for code in auth_error_codes) or error_type in [
+                "invalid_client",
+                "unauthorized_client",
+                "invalid_grant",
+            ]:
+                raise UserAuthError(f"Authentication failed: {error_type}: {error_description}")
             else:
                 raise SourceConnectionNetworkError(
                     f"Failed to fetch token: {error_type}: {error_description}"
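The reworked token handling classifies failures before raising: known AAD error codes or OAuth error strings become a user-auth error, anything else stays a network error. A small sketch of that check against a fabricated token response:

auth_error_codes = [7000215, 7000218, 700016, 90002]
auth_error_types = ["invalid_client", "unauthorized_client", "invalid_grant"]

token = {  # fabricated example response, not real data
    "error": "invalid_client",
    "error_codes": [7000215],
    "error_description": "Invalid client secret provided.",
}

is_auth_error = (
    any(code in token.get("error_codes", []) for code in auth_error_codes)
    or token.get("error", "") in auth_error_types
)
print(is_auth_error)  # True -> surfaced as UserAuthError instead of SourceConnectionNetworkError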
unstructured_ingest/processes/connectors/sharepoint.py
CHANGED

@@ -87,6 +87,7 @@ class SharepointIndexerConfig(OnedriveIndexerConfig):
     # TODO: We can probably make path non-optional on OnedriveIndexerConfig once tested
     path: str = Field(default="")

+
 @dataclass
 class SharepointIndexer(OnedriveIndexer):
     connection_config: SharepointConnectionConfig
@@ -114,14 +115,14 @@ class SharepointIndexer(OnedriveIndexer):
     def _is_root_path(self, path: str) -> bool:
         """Check if the path represents root access (empty string or legacy default)."""
         return not path or not path.strip() or path == LEGACY_DEFAULT_PATH
-
+
     def _get_target_drive_item(self, site_drive_item: DriveItem, path: str) -> DriveItem:
         """Get the drive item to search in based on the path."""
         if self._is_root_path(path):
             return site_drive_item
         else:
             return site_drive_item.get_by_path(path).get().execute_query()
-
+
     def _validate_folder_path(self, site_drive_item: DriveItem, path: str) -> None:
         """Validate that a specific folder path exists and is accessible."""
         from office365.runtime.client_request_exception import ClientRequestException
unstructured_ingest/processes/utils/logging/connector.py
ADDED (365 lines)

from typing import Any, Dict, Optional

from unstructured_ingest.logger import logger
from unstructured_ingest.processes.utils.logging.sanitizer import DataSanitizer


class LoggingConfig:
    """Configuration for connector logging behavior."""

    def __init__(
        self,
        log_file_paths: bool = False,
        log_document_locations: Optional[bool] = None,
        log_ids: bool = False,
        log_document_ids: Optional[bool] = None,
        log_progress_interval: int = 10,
        sanitize_logs: bool = True,
        show_connection_details: bool = False,
    ):
        # Backward compatibility: if new parameters aren't specified, use old ones
        self.log_file_paths = log_file_paths
        self.log_document_locations = (
            log_document_locations if log_document_locations is not None else log_file_paths
        )

        self.log_ids = log_ids
        self.log_document_ids = log_document_ids if log_document_ids is not None else log_ids

        self.log_progress_interval = log_progress_interval
        self.sanitize_logs = sanitize_logs
        self.show_connection_details = show_connection_details


class ConnectorLoggingMixin:
    """Mixin class providing standardized logging patterns for connectors."""

    def __init__(self, *args, **kwargs):
        """
        Initialize the mixin by setting up logging configuration and data sanitization.

        This method ensures that the mixin provides standardized logging patterns for connectors.
        It initializes:
        - `_logging_config`: Manages logging behavior and settings.
        - `_sanitizer`: Handles sanitization of sensitive data in logs.

        Args:
            *args: Positional arguments passed to the parent class.
            **kwargs: Keyword arguments passed to the parent class.
        """
        super().__init__(*args, **kwargs)
        self._logging_config = LoggingConfig()
        self._sanitizer = DataSanitizer()

    def set_logging_config(self, config: LoggingConfig):
        """Set the logging configuration for this connector."""
        self._logging_config = config

    def _should_sanitize(self) -> bool:
        """Check if log sanitization is enabled."""
        return self._logging_config.sanitize_logs

    def log_operation_start(self, operation: str, **kwargs):
        """Log the start of a major operation."""
        logger.info("Starting %s", operation)

        if kwargs:
            if self._should_sanitize():
                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
                logger.debug("%s parameters: %s", operation, sanitized_kwargs)
            else:
                logger.debug("%s parameters: %s", operation, kwargs)

    def log_operation_complete(self, operation: str, count: Optional[int] = None, **kwargs):
        """Log the completion of a major operation."""
        if count is not None:
            logger.info("Completed %s (%s items)", operation, count)
        else:
            logger.info("Completed %s", operation)

        if kwargs:
            if self._should_sanitize():
                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
                logger.debug("%s results: %s", operation, sanitized_kwargs)
            else:
                logger.debug("%s results: %s", operation, kwargs)

    def log_connection_validated(self, connector_type: str, endpoint: Optional[str] = None):
        """Log successful connection validation."""
        if self._logging_config.show_connection_details and endpoint:
            if self._should_sanitize():
                sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
                logger.debug(
                    "Connection to %s validated successfully: %s",
                    connector_type,
                    sanitized_endpoint,
                )
            else:
                logger.debug(
                    "Connection to %s validated successfully: %s", connector_type, endpoint
                )
        else:
            logger.debug("Connection to %s validated successfully", connector_type)

    def log_connection_failed(
        self, connector_type: str, error: Exception, endpoint: Optional[str] = None
    ):
        """Log connection validation failure."""
        if endpoint:
            if self._should_sanitize():
                sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
                logger.error(
                    "Failed to validate %s connection to %s: %s",
                    connector_type,
                    sanitized_endpoint,
                    error,
                    exc_info=True,
                )
            else:
                logger.error(
                    "Failed to validate %s connection to %s: %s",
                    connector_type,
                    endpoint,
                    error,
                    exc_info=True,
                )
        else:
            logger.error(
                "Failed to validate %s connection: %s", connector_type, error, exc_info=True
            )

    def log_progress(
        self, current: int, total: int, item_type: str = "items", operation: str = "Processing"
    ):
        """Log progress for long-running operations."""
        if total > 0 and current % self._logging_config.log_progress_interval == 0:
            progress = (current / total) * 100
            logger.info("%s: %s/%s %s (%.1f%%)", operation, current, total, item_type, progress)

    def log_batch_progress(
        self, batch_num: int, total_batches: int, batch_size: int, operation: str = "Processing"
    ):
        """Log progress for batch operations."""
        logger.info("%s batch %s/%s (%s items)", operation, batch_num, total_batches, batch_size)

    def log_document_operation(
        self,
        operation: str,
        document_location: Optional[str] = None,
        document_id: Optional[str] = None,
        content_size: Optional[int] = None,
        **kwargs,
    ):
        """Log document-related operations (universal for all connector types)."""
        if self._logging_config.log_document_locations and document_location:
            if self._should_sanitize():
                sanitized_location = self._sanitizer.sanitize_location(document_location)
                logger.debug("%s: %s", operation, sanitized_location)
            else:
                logger.debug("%s: %s", operation, document_location)
        elif self._logging_config.log_document_ids and document_id:
            if self._should_sanitize():
                sanitized_id = self._sanitizer.sanitize_document_id(document_id)
                logger.debug("%s: %s", operation, sanitized_id)
            else:
                logger.debug("%s: %s", operation, document_id)
        else:
            logger.debug("%s: <document>", operation)

        if content_size is not None:
            kwargs["content_size"] = content_size

        if kwargs:
            if self._should_sanitize():
                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
                logger.debug("%s details: %s", operation, sanitized_kwargs)
            else:
                logger.debug("%s details: %s", operation, kwargs)

    def log_file_operation(
        self,
        operation: str,
        file_path: Optional[str] = None,
        file_id: Optional[str] = None,
        **kwargs,
    ):
        """Log file-related operations (backward compatibility wrapper)."""
        self.log_document_operation(
            operation=operation, document_location=file_path, document_id=file_id, **kwargs
        )

    def log_document_download_start(
        self,
        document_location: Optional[str] = None,
        document_id: Optional[str] = None,
        content_size: Optional[int] = None,
    ):
        """Log the start of a document download/retrieval."""
        logger.info("Starting document download")

        self.log_document_operation(
            "Download",
            document_location=document_location,
            document_id=document_id,
            content_size=content_size,
        )

    def log_document_download_complete(
        self,
        document_location: Optional[str] = None,
        document_id: Optional[str] = None,
        download_path: Optional[str] = None,
        content_size: Optional[int] = None,
        items_retrieved: Optional[int] = None,
    ):
        """Log the completion of a document download/retrieval."""
        logger.info("Document download completed")

        details = {}
        if download_path:
            details["download_path"] = download_path
        if items_retrieved is not None:
            details["items_retrieved"] = items_retrieved

        self.log_document_operation(
            "Download completed",
            document_location=document_location,
            document_id=document_id,
            content_size=content_size,
            **details,
        )

    def log_download_start(
        self,
        file_path: Optional[str] = None,
        file_id: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the start of a file download (backward compatibility wrapper)."""
        self.log_document_download_start(
            document_location=file_path, document_id=file_id, content_size=file_size
        )

    def log_download_complete(
        self,
        file_path: Optional[str] = None,
        file_id: Optional[str] = None,
        download_path: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the completion of a file download (backward compatibility wrapper)."""
        self.log_document_download_complete(
            document_location=file_path,
            document_id=file_id,
            download_path=download_path,
            content_size=file_size,
        )

    def log_upload_start(
        self,
        file_path: Optional[str] = None,
        destination: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the start of a file upload."""
        logger.info("Starting file upload")

        details = {}
        if destination:
            details["destination"] = destination

        self.log_file_operation("Upload", file_path=file_path, **details)

    def log_upload_complete(
        self,
        file_path: Optional[str] = None,
        destination: Optional[str] = None,
        file_id: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the completion of a file upload."""
        logger.info("File upload completed")

        details = {}
        if destination:
            details["destination"] = destination
        if file_id:
            details["file_id"] = file_id

        self.log_file_operation("Upload completed", file_path=file_path, **details)

    def log_indexing_start(self, source_type: str, count: Optional[int] = None):
        """Log the start of indexing operation."""
        if count:
            logger.info("Starting indexing of %s (%s items)", source_type, count)
        else:
            logger.info("Starting indexing of %s", source_type)

    def log_indexing_complete(self, source_type: str, count: int):
        """Log the completion of indexing operation."""
        logger.info("Indexing completed: %s %s items indexed", count, source_type)

    def log_info(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
        """Log an info message with optional context and sanitization."""
        logger.info(message)
        self._log_context("Info", context, **kwargs)

    def log_debug(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
        """Log a debug message with optional context and sanitization."""
        logger.debug(message)
        self._log_context("Debug", context, **kwargs)

    def log_warning(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
        """Log a warning message with optional context and sanitization."""
        logger.warning(message)
        self._log_context("Warning", context, **kwargs)

    def log_error(
        self,
        message: str,
        error: Optional[Exception] = None,
        context: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """Log an error message with optional exception, context and sanitization."""
        if error:
            logger.error("%s: %s", message, error, exc_info=True)
        else:
            logger.error(message)
        self._log_context("Error", context, **kwargs)

    def _log_context(self, log_type: str, context: Optional[Dict[str, Any]], **kwargs):
        """Helper method to log context with sanitization."""
        all_context = {}
        if context:
            all_context.update(context)
        if kwargs:
            all_context.update(kwargs)

        if all_context:
            if self._should_sanitize():
                sanitized_context = self._sanitizer.sanitize_dict(all_context)
                logger.debug("%s context: %s", log_type, sanitized_context)
            else:
                logger.debug("%s context: %s", log_type, all_context)

    def log_api_call(self, method: str, endpoint: str, status_code: Optional[int] = None, **kwargs):
        """Log API call details."""
        if self._should_sanitize():
            sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
            if status_code:
                logger.debug("API call: %s %s -> %s", method, sanitized_endpoint, status_code)
            else:
                logger.debug("API call: %s %s", method, sanitized_endpoint)
        else:
            if status_code:
                logger.debug("API call: %s %s -> %s", method, endpoint, status_code)
            else:
                logger.debug("API call: %s %s", method, endpoint)

        if kwargs:
            if self._should_sanitize():
                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
                logger.debug("API call details: %s", sanitized_kwargs)
            else:
                logger.debug("API call details: %s", kwargs)
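In practice a connector only has to inherit the mixin (directly or via BaseConnector) and call the high-level helpers, with LoggingConfig tuning how much detail is emitted. A minimal usage sketch with a made-up connector class and invented file paths:

from unstructured_ingest.processes.utils.logging.connector import (
    ConnectorLoggingMixin,
    LoggingConfig,
)


class DemoConnector(ConnectorLoggingMixin):
    def fetch(self, paths: list) -> None:
        self.log_operation_start("Demo fetch", total=len(paths))
        for i, path in enumerate(paths, start=1):
            self.log_file_operation("Fetching", file_path=path)
            self.log_progress(current=i, total=len(paths), item_type="files")
        self.log_operation_complete("Demo fetch", count=len(paths))


connector = DemoConnector()
# Opt in to (sanitized) path logging and a tighter progress interval.
connector.set_logging_config(LoggingConfig(log_file_paths=True, log_progress_interval=5))
connector.fetch([f"/data/reports/file_{i}.pdf" for i in range(10)])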
unstructured_ingest/processes/utils/logging/sanitizer.py
ADDED (117 lines)

from pathlib import Path
from typing import Any, Dict, Optional, Union
from urllib.parse import urlparse


class DataSanitizer:
    """Utility class for sanitizing sensitive data in logs."""

    @staticmethod
    def sanitize_path(path: Union[str, Path]) -> str:
        """Sanitize file paths for logging, showing only filename and partial path."""
        if not path:
            return "<empty>"

        path_str = str(path)
        path_obj = Path(path_str)

        if len(path_obj.parts) > 2:
            return f".../{path_obj.parent.name}/{path_obj.name}"
        return path_obj.name

    @staticmethod
    def sanitize_id(identifier: str) -> str:
        """Sanitize IDs for logging, showing only first/last few characters."""
        if not identifier:
            return "<id>"
        if len(identifier) < 10:
            half_len = len(identifier) // 2
            return f"{identifier[:half_len]}..."
        return f"{identifier[:4]}...{identifier[-4:]}"

    @staticmethod
    def sanitize_url(url: str) -> str:
        """Sanitize URLs for logging, removing sensitive query parameters."""
        if not url:
            return "<url>"
        try:
            parsed = urlparse(url)
            return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
        except (ValueError, TypeError):
            return "<url>"

    @staticmethod
    def sanitize_token(token: str) -> str:
        """Sanitize tokens and secrets for logging."""
        if not token:
            return "<token>"
        if len(token) < 10:
            half_len = len(token) // 2
            return f"{token[:half_len]}..."
        return f"{token[:4]}...{token[-4:]}"

    @staticmethod
    def sanitize_location(location: Union[str, Path]) -> str:
        """Sanitize document locations (file paths, URLs, database references) for logging."""
        if not location:
            return "<empty>"

        location_str = str(location)

        # Handle URLs
        if location_str.startswith(("http://", "https://", "ftp://", "ftps://")):
            return DataSanitizer.sanitize_url(location_str)

        # Handle database-style references (table:id, collection/document, etc.)
        if ":" in location_str and not location_str.startswith("/"):
            parts = location_str.split(":", 1)
            if len(parts) == 2:
                table_name, record_id = parts
                return f"{table_name}:{DataSanitizer.sanitize_id(record_id)}"

        return DataSanitizer.sanitize_path(location_str)

    @staticmethod
    def sanitize_document_id(document_id: str) -> str:
        """Sanitize document IDs for logging (alias for sanitize_id for clarity)."""
        return DataSanitizer.sanitize_id(document_id)

    @staticmethod
    def sanitize_dict(data: Dict[str, Any], sensitive_keys: Optional[set] = None) -> Dict[str, Any]:
        """Sanitize dictionary data for logging."""
        if sensitive_keys is None:
            sensitive_keys = {
                "password",
                "token",
                "secret",
                "key",
                "api_key",
                "access_token",
                "refresh_token",
                "client_secret",
                "private_key",
                "credentials",
            }

        sanitized = {}
        for k, v in data.items():
            key_lower = k.lower()
            if any(sensitive_key in key_lower for sensitive_key in sensitive_keys):
                sanitized[k] = DataSanitizer.sanitize_token(str(v))
            elif isinstance(v, dict):
                sanitized[k] = DataSanitizer.sanitize_dict(v, sensitive_keys)
            elif isinstance(v, (str, Path)) and (
                "path" in key_lower
                or "file" in key_lower
                or "location" in key_lower
                or "document_location" in key_lower
            ):
                sanitized[k] = DataSanitizer.sanitize_location(v)
            elif isinstance(v, str) and (
                ("id" in key_lower and len(str(v)) > 8)
                or ("document_id" in key_lower and len(str(v)) > 8)
            ):
                sanitized[k] = DataSanitizer.sanitize_document_id(v)
            else:
                sanitized[k] = v
        return sanitized
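What the sanitizer actually emits for typical values, given the implementation above; the paths, URL, and key below are invented:

from unstructured_ingest.processes.utils.logging.sanitizer import DataSanitizer

print(DataSanitizer.sanitize_path("/home/user/projects/reports/q3.pdf"))
# .../reports/q3.pdf
print(DataSanitizer.sanitize_url("https://bucket.example.com/key?X-Amz-Signature=abc"))
# https://bucket.example.com/key
print(DataSanitizer.sanitize_dict({"api_key": "sk-1234567890abcd", "file_path": "/tmp/data/in.csv"}))
# {'api_key': 'sk-1...abcd', 'file_path': '.../data/in.csv'}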
unstructured_ingest/utils/html.py
CHANGED

@@ -129,6 +129,7 @@ class HtmlMixin(BaseModel):
         )
         result_file_data = file_data.model_copy(deep=True)
         result_file_data.metadata.url = url
+        result_file_data.display_name = filename
         if result_file_data.metadata.record_locator is None:
             result_file_data.metadata.record_locator = {}
         result_file_data.metadata.record_locator["parent_url"] = url
unstructured_ingest/utils/tls.py
ADDED (15 lines)

import os
import ssl

import certifi


def ssl_context_with_optional_ca_override():
    """
    # https://www.python-httpx.org/advanced/ssl/#working-with-ssl_cert_file-and-ssl_cert_dir
    # We choose REQUESTS_CA_BUNDLE because that works with many other Python packages.
    """
    return ssl.create_default_context(
        cafile=os.environ.get("REQUESTS_CA_BUNDLE", certifi.where()),
        capath=os.environ.get("REQUESTS_CA_BUNDLE"),
    )
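The helper rebuilds the context on every call, so REQUESTS_CA_BUNDLE can be changed between calls; with the variable unset it simply loads certifi's bundle. A small check of the default behaviour:

from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override

# With REQUESTS_CA_BUNDLE unset, the context is populated from certifi's CA bundle.
ctx = ssl_context_with_optional_ca_override()
assert ctx.cert_store_stats()["x509_ca"] > 0

# Pointing REQUESTS_CA_BUNDLE at a readable PEM file (for example a corporate root CA)
# before the call makes the returned context trust that bundle instead.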
{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: unstructured_ingest
-Version: 1.0.55
+Version: 1.1.0
 Summary: Local ETL data pipeline to get data RAG ready
 Author-email: Unstructured Technologies <devops@unstructuredai.io>
 License-Expression: Apache-2.0
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: <3.13,>=3.9
+Requires-Dist: certifi>=2025.7.14
 Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
 Requires-Dist: pydantic>=2.7
{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/RECORD
RENAMED

@@ -1,10 +1,10 @@
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=OTJtt59bB59UuRwC7CjPgJNmkdDC7RUC5Ukrfd-P-CE,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
 unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
 unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
-unstructured_ingest/otel.py,sha256=
+unstructured_ingest/otel.py,sha256=wxnkdZqFtlypmOn4QX6uLxjGa7jSoFabP3PEG5FjH1g,4669
 unstructured_ingest/unstructured_api.py,sha256=4e2ZNWIihk0eje4R3ZQ0NOYNbmMZDv_O-rnJo94kaGE,5127
 unstructured_ingest/cli/README.md,sha256=lfsXY2jOO__OuDYcIs8N0yLhZWzrSQ_dyXbSFtEMlQ8,1504
 unstructured_ingest/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -22,18 +22,18 @@ unstructured_ingest/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 unstructured_ingest/data_types/entities.py,sha256=ECc6EkZ5_ZUvK7uaALYOynfFmofIrHYIJZfb67hUIxA,371
 unstructured_ingest/data_types/file_data.py,sha256=J0RQa7YXhhxiLVzhPbF5Hl2nzSpxLFK9vrP6RTBWlSg,3833
 unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-unstructured_ingest/embed/azure_openai.py,sha256=
+unstructured_ingest/embed/azure_openai.py,sha256=Q_buBkAcx9FBuTsAqKbRU8vd9vDh8JoDOEth4fFxHbg,2160
 unstructured_ingest/embed/bedrock.py,sha256=dzfCsatB0i8hUp1YnXmoImoxgvUdZ4srKI6eSvn-lYM,9132
 unstructured_ingest/embed/huggingface.py,sha256=6Gx9L3xa3cv9fX4AMuLsePJQF4T_jwkKjovfqF5X1NM,2435
 unstructured_ingest/embed/interfaces.py,sha256=Y3PLhgWnMDmtpugE37hlAiBIbC8izrFFXXkrPVby-HY,5137
 unstructured_ingest/embed/mixedbreadai.py,sha256=uKTqzoi4M_WeYZu-qc_TSxwJONOESzxVbBLUbD1Wbns,3922
 unstructured_ingest/embed/octoai.py,sha256=yZuD7R4mEKS4Jjyae_IrNWogMPOFFS8gW5oUllj3ROU,4540
-unstructured_ingest/embed/openai.py,sha256=
+unstructured_ingest/embed/openai.py,sha256=09I5BIrb-iGsv92LOV46-F7oZ7j1JnJIOQFARNKVq3k,5029
 unstructured_ingest/embed/togetherai.py,sha256=ykaveEUBxBGBzRlmWc9utCFQuUWHdbW4F9KAb-uBAJM,3630
 unstructured_ingest/embed/vertexai.py,sha256=DphvPhiYdXTMrQxJCd-64vMs4iVdLY_BphHqz3n5HfM,3758
 unstructured_ingest/embed/voyageai.py,sha256=EOrYzaoXOZ6C4fNkMlCgb8KA8rdfgVXN3USMFpnn0Bs,4698
 unstructured_ingest/interfaces/__init__.py,sha256=QIkWqjsq9INTa89gPuXlMlQL4s3y5TqLmPkuVuTyXcs,795
-unstructured_ingest/interfaces/connector.py,sha256=
+unstructured_ingest/interfaces/connector.py,sha256=wYWIEAL99KdQDDzzDYSf_yE8p1wjThSPMgEV5qyfiPc,1885
 unstructured_ingest/interfaces/downloader.py,sha256=xX0ZzsFRSzZb7SAeoeQph8sIbVq13DRw-3MYkdADrY0,2918
 unstructured_ingest/interfaces/indexer.py,sha256=c2FwWJEQHfFD6vO-tGfYLpLiIs-TYViLAt8YmHfDbaM,824
 unstructured_ingest/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
@@ -79,12 +79,12 @@ unstructured_ingest/processes/connectors/local.py,sha256=CesMduUiSPqdJpqIyW28icG
 unstructured_ingest/processes/connectors/milvus.py,sha256=L-PM5osheNyNsLGYZmiF3rRmeulp7Ejk92JCoaQ_F9Y,12075
 unstructured_ingest/processes/connectors/mongodb.py,sha256=idjolwS5TXShcIz2jR_socSgh8HOzJwyOnzE1qLUPBw,15362
 unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
-unstructured_ingest/processes/connectors/onedrive.py,sha256=
+unstructured_ingest/processes/connectors/onedrive.py,sha256=JPa30X2abVx9SHye_cLOOj4csj_ut8nMjwRnMcgHFhI,20163
 unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y1vhjsS8uSg7aZ2LBrTjfHk,9376
 unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
 unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
 unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
-unstructured_ingest/processes/connectors/sharepoint.py,sha256=
+unstructured_ingest/processes/connectors/sharepoint.py,sha256=ooPJoAEHj-epEM39iiYbNWdDUdEwt466fLjIcYSNTM8,10670
 unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
 unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
 unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
@@ -109,9 +109,9 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
 unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
 unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
 unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
-unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=
+unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=p0u6JL6ouEPe4R_i_rAhzlvSDyMO3-NDHiw_CtPaCTc,17875
 unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
-unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=
+unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=P5nd3hamhLFO3l5nV3lMuIxHtb_rZYFP4F6q_py3xpc,7492
 unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
 unstructured_ingest/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py,sha256=kf0UpgdAY2KK1R1FbAB6GEBBAIOeYQ8cZIr3bp660qM,374
@@ -218,22 +218,25 @@ unstructured_ingest/processes/connectors/weaviate/weaviate.py,sha256=yB67gxvo3X0
 unstructured_ingest/processes/connectors/zendesk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/zendesk/client.py,sha256=GvPIpx4aYdD58-edHgvCFjFao94uR0O5Yf4dT9NCmSk,11952
 unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=doS6d7ZhXBgJN8aPVf7vnQr8BciQbzX8-4yl4_hDZ7w,9253
-unstructured_ingest/processes/utils/__init__.py,sha256=
+unstructured_ingest/processes/utils/__init__.py,sha256=v3IQ-Ft0f7PoHhGcYiiD6Yrr6oi-RiGeD6nTKowbEDk,199
 unstructured_ingest/processes/utils/blob_storage.py,sha256=apMUmm9loxdbTRkkLH4VhG9kUVyiw9PFUJheSDxSxPk,1023
-unstructured_ingest/utils/
+unstructured_ingest/processes/utils/logging/connector.py,sha256=xKsXSavbu2U8ZP0KP7jk5192ZDr5HzaBCBCf0GKe1HI,14109
+unstructured_ingest/processes/utils/logging/sanitizer.py,sha256=ZG4Cdcc2yrVmmgdUOJCaUKgp5mZhBpEOMjAbj5Cth_s,4251
+unstructured_ingest/utils/__init__.py,sha256=mU8mlrdah00MPuZM6JqXTkrpXK-sDYiv5y5Mwl8eesM,158
 unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
 unstructured_ingest/utils/compression.py,sha256=PPC-ys3qEAtELf6-irhp8v8M634pFFCJEvA6o7PXaLI,2712
 unstructured_ingest/utils/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
 unstructured_ingest/utils/data_prep.py,sha256=yqrv7x_nlj0y3uaN0m0Bnsekb7VIQnwABWPa24KU5QI,7426
 unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
 unstructured_ingest/utils/filesystem.py,sha256=nWxpQd8ogTgmXb7ZouupX6sE5v_qFXNzPl4DtZSStwE,1036
-unstructured_ingest/utils/html.py,sha256=
+unstructured_ingest/utils/html.py,sha256=lm5lVYhVl7ztntquxzMLVQ8EmK7wkvYgNvlIuHnenoM,6865
 unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
 unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
 unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
 unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
-unstructured_ingest
-unstructured_ingest-1.0.
-unstructured_ingest-1.0.
-unstructured_ingest-1.0.
-unstructured_ingest-1.0.
+unstructured_ingest/utils/tls.py,sha256=Ra8Mii1F4VqErRreg76PBI0eAqPBC009l0sSHa8FdnA,448
+unstructured_ingest-1.1.0.dist-info/METADATA,sha256=tJonV6SbQB5XL3BeyL8coDFhzzChMKGuSPQWQ3aoOdE,8875
+unstructured_ingest-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+unstructured_ingest-1.1.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-1.1.0.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-1.1.0.dist-info/RECORD,,

{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/WHEEL
RENAMED
File without changes

{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/entry_points.txt
RENAMED
File without changes

{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/licenses/LICENSE.md
RENAMED
File without changes