unstructured-ingest 1.0.56__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "1.0.56" # pragma: no cover
+__version__ = "1.1.0" # pragma: no cover

unstructured_ingest/embed/azure_openai.py
@@ -9,6 +9,7 @@ from unstructured_ingest.embed.openai import (
     OpenAIEmbeddingEncoder,
 )
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override
 
 if TYPE_CHECKING:
     from openai import AsyncAzureOpenAI, AzureOpenAI
@@ -23,9 +24,11 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
 
     @requires_dependencies(["openai"], extras="openai")
     def get_client(self) -> "AzureOpenAI":
-        from openai import AzureOpenAI
+        from openai import AzureOpenAI, DefaultHttpxClient
 
+        client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
         return AzureOpenAI(
+            http_client=client,
             api_key=self.api_key.get_secret_value(),
             api_version=self.api_version,
             azure_endpoint=self.azure_endpoint,
@@ -33,9 +36,11 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
 
     @requires_dependencies(["openai"], extras="openai")
     def get_async_client(self) -> "AsyncAzureOpenAI":
-        from openai import AsyncAzureOpenAI
+        from openai import AsyncAzureOpenAI, DefaultAsyncHttpxClient
 
+        client = DefaultAsyncHttpxClient(verify=ssl_context_with_optional_ca_override())
         return AsyncAzureOpenAI(
+            http_client=client,
             api_key=self.api_key.get_secret_value(),
             api_version=self.api_version,
             azure_endpoint=self.azure_endpoint,

unstructured_ingest/embed/openai.py
@@ -18,6 +18,7 @@ from unstructured_ingest.errors_v2 import (
 )
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override
 
 if TYPE_CHECKING:
     from openai import AsyncOpenAI, OpenAI
@@ -86,15 +87,21 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
 
     @requires_dependencies(["openai"], extras="openai")
     def get_client(self) -> "OpenAI":
-        from openai import OpenAI
+        from openai import DefaultHttpxClient, OpenAI
 
-        return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
+        client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
+        return OpenAI(
+            api_key=self.api_key.get_secret_value(), http_client=client, base_url=self.base_url
+        )
 
     @requires_dependencies(["openai"], extras="openai")
     def get_async_client(self) -> "AsyncOpenAI":
-        from openai import AsyncOpenAI
+        from openai import AsyncOpenAI, DefaultAsyncHttpxClient
 
-        return AsyncOpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
+        client = DefaultAsyncHttpxClient(verify=ssl_context_with_optional_ca_override())
+        return AsyncOpenAI(
+            api_key=self.api_key.get_secret_value(), http_client=client, base_url=self.base_url
+        )
 
 
 @dataclass
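
The embedder changes above all follow one pattern: build an httpx client whose TLS verification honors REQUESTS_CA_BUNDLE, then hand it to the OpenAI SDK. A minimal sketch of that pattern outside the wheel (the placeholder API key and the standalone usage are assumptions; the imports match the diff above):

    from openai import DefaultHttpxClient, OpenAI

    from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override

    # Route all SDK traffic through an httpx client whose SSL context prefers
    # REQUESTS_CA_BUNDLE when set and falls back to certifi otherwise.
    http_client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
    client = OpenAI(api_key="sk-...", http_client=http_client)  # placeholder key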

unstructured_ingest/interfaces/connector.py
@@ -5,6 +5,8 @@ from typing import Any, TypeVar, Union
 from pydantic import BaseModel, Secret, model_validator
 from pydantic.types import _SecretBase
 
+from unstructured_ingest.processes.utils.logging.connector import ConnectorLoggingMixin
+
 
 class AccessConfig(BaseModel):
     """Meant to designate holding any sensitive information associated with other configs
@@ -46,5 +48,9 @@ ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
 
 
 @dataclass
-class BaseConnector(ABC):
+class BaseConnector(ABC, ConnectorLoggingMixin):
     connection_config: ConnectionConfigT
+
+    def __post_init__(self):
+        """Initialize the logging mixin after dataclass initialization."""
+        ConnectorLoggingMixin.__init__(self)
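
The explicit ConnectorLoggingMixin.__init__ call in __post_init__ is needed because a dataclass-generated __init__ never calls super().__init__(), so the mixin's state would otherwise never be set up. A self-contained sketch of that interplay (the class names here are illustrative, not from the package):

    from dataclasses import dataclass

    class LoggingMixinSketch:
        def __init__(self):
            self.configured = True  # stand-in for _logging_config/_sanitizer setup

    @dataclass
    class ConnectorSketch(LoggingMixinSketch):
        name: str = "demo"

        def __post_init__(self):
            # The generated __init__ skips the mixin, so initialize it here.
            LoggingMixinSketch.__init__(self)

    assert ConnectorSketch().configured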

unstructured_ingest/otel.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from dataclasses import dataclass, field
 from typing import Callable, ClassVar, Optional, Protocol, Sequence
@@ -31,13 +32,27 @@ class LogSpanExporter(ConsoleSpanExporter):
         self.log_out(self.formatter(span))
         return SpanExportResult.SUCCESS
 
+def get_log_out() -> Callable:
+    level_names_mapping = {
+        'CRITICAL': logging.CRITICAL,
+        'FATAL': logging.FATAL,
+        'ERROR': logging.ERROR,
+        'WARN': logging.WARNING,
+        'WARNING': logging.WARNING,
+        'INFO': logging.INFO,
+        'DEBUG': logging.DEBUG,
+        'NOTSET': logging.NOTSET,
+    }
+    log_level = os.getenv("OTEL_LOG_LEVEL", "DEBUG").upper()
+    log_level_int = level_names_mapping.get(log_level, logging.DEBUG)
+    return lambda message: logger.log(log_level_int, message)
 
 @dataclass
 class OtelHandler:
     otel_endpoint: Optional[str] = None
     service_name: str = "unstructured-ingest"
     trace_provider: TracerProvider = field(init=False)
-    log_out: Callable = field(default=logger.info)
+    log_out: Callable = field(default=get_log_out())
     trace_context_key: ClassVar[str] = "_trace_context"
 
     def init_trace(self):
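
get_log_out() resolves the span-export log level once, from the OTEL_LOG_LEVEL environment variable, and returns a callable bound to it. A small sketch of the lookup (assuming the package's opentelemetry-sdk dependency is installed so the module imports cleanly):

    import os

    os.environ["OTEL_LOG_LEVEL"] = "warn"  # case-insensitive; unknown names fall back to DEBUG

    from unstructured_ingest.otel import get_log_out

    log_out = get_log_out()
    log_out("span exported")  # emitted via logger.log(logging.WARNING, ...)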

unstructured_ingest/processes/connectors/fsspec/fsspec.py
@@ -28,7 +28,6 @@ from unstructured_ingest.interfaces import (
     Uploader,
     UploaderConfig,
 )
-from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
 from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
 
@@ -106,6 +105,12 @@ class FsspecIndexer(Indexer):
     def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
+        self.log_operation_start(
+            "Connection validation",
+            protocol=self.index_config.protocol,
+            path=self.index_config.path_without_protocol,
+        )
+
         try:
             fs = get_filesystem_class(self.index_config.protocol)(
                 **self.connection_config.get_access_config(),
@@ -113,13 +118,24 @@ class FsspecIndexer(Indexer):
             files = fs.ls(path=self.index_config.path_without_protocol, detail=True)
             valid_files = [x.get("name") for x in files if x.get("type") == "file"]
             if not valid_files:
+                self.log_operation_complete("Connection validation", count=0)
                 return
             file_to_sample = valid_files[0]
-            logger.debug(f"attempting to make HEAD request for file: {file_to_sample}")
+            self.log_debug(f"attempting to make HEAD request for file: {file_to_sample}")
             with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                 client.head(path=file_to_sample)
+
+            self.log_connection_validated(
+                connector_type=self.connector_type,
+                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
+            )
+
         except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            self.log_connection_failed(
+                connector_type=self.connector_type,
+                error=e,
+                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
+            )
             raise self.wrap_error(e=e)
 
     def get_file_info(self) -> list[dict[str, Any]]:
@@ -150,7 +166,7 @@ class FsspecIndexer(Indexer):
 
     def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
         if len(files) <= n:
-            logger.warning(
+            self.log_warning(
                 f"number of files to be sampled={n} is not smaller than the number"
                 f" of files found ({len(files)}). Returning all of the files as the"
                 " sample."
@@ -201,9 +217,22 @@ class FsspecIndexer(Indexer):
         init_file_data.additional_metadata = self.get_metadata(file_info=file_info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        self.log_indexing_start(f"{self.connector_type} files")
+
         files = self.get_file_info()
-        for file_info in files:
+        total_files = len(files)
+
+        self.log_operation_start("File indexing", total_files=total_files)
+
+        for i, file_info in enumerate(files):
             file_path = self.get_path(file_info=file_info)
+
+            # Only log progress for larger operations
+            if total_files > 5:
+                self.log_progress(
+                    current=i + 1, total=total_files, item_type="files", operation="Indexing"
+                )
+
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")
@@ -223,6 +252,8 @@ class FsspecIndexer(Indexer):
                 display_name=file_path,
             )
 
+        self.log_indexing_complete(f"{self.connector_type} files", total_files)
+
 
 class FsspecDownloaderConfig(DownloaderConfig):
     pass
@@ -272,25 +303,57 @@ class FsspecDownloader(Downloader):
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         mkdir_concurrent_safe(download_path.parent)
+
+        rpath = file_data.additional_metadata["original_file_path"]
+        file_size = file_data.metadata.filesize_bytes
+        self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)
+
         try:
-            rpath = file_data.additional_metadata["original_file_path"]
             with self.connection_config.get_client(protocol=self.protocol) as client:
                 client.get_file(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
+
         except Exception as e:
+            self.log_error(
+                "File download failed",
+                error=e,
+                context={"file_path": rpath, "file_id": file_data.identifier},
+            )
             raise self.wrap_error(e=e)
+
+        self.log_download_complete(
+            file_path=rpath,
+            file_id=file_data.identifier,
+            download_path=str(download_path),
+        )
+
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
     async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         mkdir_concurrent_safe(download_path.parent)
+        rpath = file_data.additional_metadata["original_file_path"]
+        file_size = file_data.metadata.filesize_bytes
+        self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)
+
        try:
-            rpath = file_data.additional_metadata["original_file_path"]
             with self.connection_config.get_client(protocol=self.protocol) as client:
                 await client.get_file(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
         except Exception as e:
+            self.log_error(
+                "File download failed",
+                error=e,
+                context={"file_path": rpath, "file_id": file_data.identifier},
+            )
             raise self.wrap_error(e=e)
+
+        self.log_download_complete(
+            file_path=rpath,
+            file_id=file_data.identifier,
+            download_path=str(download_path),
+        )
+
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
 
@@ -321,6 +384,7 @@ class FsspecUploader(Uploader):
     )
 
     def __post_init__(self):
+        super().__post_init__()
         # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
         if not self.upload_config:
             raise TypeError(
@@ -334,6 +398,8 @@ class FsspecUploader(Uploader):
     def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
+        self.log_operation_start("Connection validation", protocol=self.upload_config.protocol)
+
         try:
             fs = get_filesystem_class(self.upload_config.protocol)(
                 **self.connection_config.get_access_config(),
@@ -341,7 +407,16 @@ class FsspecUploader(Uploader):
             upload_path = Path(self.upload_config.path_without_protocol) / "_empty"
             fs.write_bytes(path=upload_path.as_posix(), value=b"")
         except Exception as e:
+            self.log_connection_failed(
+                connector_type=self.connector_type,
+                error=e,
+                endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
+            )
             raise self.wrap_error(e=e)
+        self.log_connection_validated(
+            connector_type=self.connector_type,
+            endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
+        )
 
     def get_upload_path(self, file_data: FileData) -> Path:
         upload_path = Path(
@@ -353,14 +428,31 @@ class FsspecUploader(Uploader):
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-        logger.debug(f"writing local file {path_str} to {upload_path}")
-        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
-            client.upload(lpath=path_str, rpath=upload_path.as_posix())
+        self.log_upload_start(file_path=path_str, destination=upload_path.as_posix())
+        try:
+            with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+                client.upload(lpath=path_str, rpath=upload_path.as_posix())
+        except Exception as e:
+            self.log_error(
+                "File upload failed",
+                error=e,
+                context={"file_path": path_str, "destination": upload_path.as_posix()},
+            )
+            raise self.wrap_error(e=e)
+        self.log_upload_complete(file_path=path_str, destination=upload_path.as_posix())
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-        # Odd that fsspec doesn't run exists() as async even when client support async
-        logger.debug(f"writing local file {path_str} to {upload_path}")
-        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
-            client.upload(lpath=path_str, rpath=upload_path.as_posix())
+        self.log_upload_start(file_path=path_str, destination=upload_path.as_posix())
+        try:
+            with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+                client.upload(lpath=path_str, rpath=upload_path.as_posix())
+        except Exception as e:
+            self.log_error(
+                "File upload failed",
+                error=e,
+                context={"file_path": path_str, "destination": upload_path.as_posix()},
+            )
+            raise self.wrap_error(e=e)
+        self.log_upload_complete(file_path=path_str, destination=upload_path.as_posix())
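
Two throttles govern the new indexing progress logs: run() only reports at all when more than 5 files are found, and log_progress itself only fires every Nth item, where N defaults to LoggingConfig's log_progress_interval of 10. A standalone sketch of the combined effect:

    total_files = 23
    interval = 10  # mirrors LoggingConfig(log_progress_interval=10)
    for i in range(total_files):
        current = i + 1
        # run() gates on collection size; log_progress gates on the interval.
        if total_files > 5 and current % interval == 0:
            print(f"Indexing: {current}/{total_files} files ({current / total_files * 100:.1f}%)")
    # Prints at 10/23 and 20/23 only.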

unstructured_ingest/processes/connectors/fsspec/s3.py
@@ -104,7 +104,13 @@ class S3ConnectionConfig(FsspecConnectionConfig):
             return UserError(message)
         if http_code >= 500:
             return ProviderError(message)
-        logger.error(f"unhandled exception from s3 ({type(e)}): {e}", exc_info=True)
+        logger.error(
+            "Unhandled exception from S3 (type: %s, endpoint: %s): %s",
+            type(e).__name__,
+            self.endpoint_url or "default",
+            e,
+            exc_info=True,
+        )
         return e
 
 
@@ -122,6 +128,10 @@ class S3Indexer(FsspecIndexer):
 
     def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
         path = file_info["Key"]
+
+        self.log_debug("Getting metadata for S3 object", context={"file_path": path})
+        self.log_file_operation("Getting metadata", file_path=path)
+
         date_created = None
         date_modified = None
         modified = file_info.get("LastModified")
@@ -147,9 +157,9 @@ class S3Indexer(FsspecIndexer):
         record_locator["metadata"] = metadata
         issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
         if issue_characters:
-            logger.warning(
-                f"File path {path} contains characters "
-                f"that can cause issues with S3: {issue_characters}"
+            self.log_warning(
+                f"File path contains characters that can cause issues with S3: {issue_characters}",
+                context={"path": path, "problematic_characters": issue_characters},
             )
         return FileDataSourceMetadata(
             date_created=date_created,

unstructured_ingest/processes/connectors/onedrive.py
@@ -115,23 +115,24 @@ class OnedriveConnectionConfig(ConnectionConfig):
         except ValueError as exc:
             logger.error("Couldn't set up credentials.")
             raise exc
-
+
         if "error" in token:
             error_codes = token.get("error_codes", [])
             error_type = token.get("error", "")
             error_description = token.get("error_description", "")
-
+
             # 7000215: Invalid client secret provided
             # 7000218: Invalid client id provided
             # 700016: Application not found in directory
             # 90002: Tenant not found
             auth_error_codes = [7000215, 7000218, 700016, 90002]
-
-            if (any(code in error_codes for code in auth_error_codes) or
-                error_type in ["invalid_client", "unauthorized_client", "invalid_grant"]):
-                raise UserAuthError(
-                    f"Authentication failed: {error_type}: {error_description}"
-                )
+
+            if any(code in error_codes for code in auth_error_codes) or error_type in [
+                "invalid_client",
+                "unauthorized_client",
+                "invalid_grant",
+            ]:
+                raise UserAuthError(f"Authentication failed: {error_type}: {error_description}")
             else:
                 raise SourceConnectionNetworkError(
                     f"Failed to fetch token: {error_type}: {error_description}"

unstructured_ingest/processes/connectors/sharepoint.py
@@ -87,6 +87,7 @@ class SharepointIndexerConfig(OnedriveIndexerConfig):
     # TODO: We can probably make path non-optional on OnedriveIndexerConfig once tested
     path: str = Field(default="")
 
+
 @dataclass
 class SharepointIndexer(OnedriveIndexer):
     connection_config: SharepointConnectionConfig
@@ -114,14 +115,14 @@ class SharepointIndexer(OnedriveIndexer):
     def _is_root_path(self, path: str) -> bool:
         """Check if the path represents root access (empty string or legacy default)."""
         return not path or not path.strip() or path == LEGACY_DEFAULT_PATH
-
+
     def _get_target_drive_item(self, site_drive_item: DriveItem, path: str) -> DriveItem:
         """Get the drive item to search in based on the path."""
         if self._is_root_path(path):
             return site_drive_item
         else:
             return site_drive_item.get_by_path(path).get().execute_query()
-
+
     def _validate_folder_path(self, site_drive_item: DriveItem, path: str) -> None:
         """Validate that a specific folder path exists and is accessible."""
         from office365.runtime.client_request_exception import ClientRequestException

unstructured_ingest/processes/utils/__init__.py
@@ -0,0 +1,8 @@
+from .logging.connector import ConnectorLoggingMixin, LoggingConfig
+from .logging.sanitizer import DataSanitizer
+
+__all__ = [
+    "ConnectorLoggingMixin",
+    "DataSanitizer",
+    "LoggingConfig",
+]

unstructured_ingest/processes/utils/logging/connector.py
@@ -0,0 +1,365 @@
+from typing import Any, Dict, Optional
+
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.utils.logging.sanitizer import DataSanitizer
+
+
+class LoggingConfig:
+    """Configuration for connector logging behavior."""
+
+    def __init__(
+        self,
+        log_file_paths: bool = False,
+        log_document_locations: Optional[bool] = None,
+        log_ids: bool = False,
+        log_document_ids: Optional[bool] = None,
+        log_progress_interval: int = 10,
+        sanitize_logs: bool = True,
+        show_connection_details: bool = False,
+    ):
+        # Backward compatibility: if new parameters aren't specified, use old ones
+        self.log_file_paths = log_file_paths
+        self.log_document_locations = (
+            log_document_locations if log_document_locations is not None else log_file_paths
+        )
+
+        self.log_ids = log_ids
+        self.log_document_ids = log_document_ids if log_document_ids is not None else log_ids
+
+        self.log_progress_interval = log_progress_interval
+        self.sanitize_logs = sanitize_logs
+        self.show_connection_details = show_connection_details
+
+
+class ConnectorLoggingMixin:
+    """Mixin class providing standardized logging patterns for connectors."""
+
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize the mixin by setting up logging configuration and data sanitization.
+
+        This method ensures that the mixin provides standardized logging patterns for connectors.
+        It initializes:
+        - `_logging_config`: Manages logging behavior and settings.
+        - `_sanitizer`: Handles sanitization of sensitive data in logs.
+
+        Args:
+            *args: Positional arguments passed to the parent class.
+            **kwargs: Keyword arguments passed to the parent class.
+        """
+        super().__init__(*args, **kwargs)
+        self._logging_config = LoggingConfig()
+        self._sanitizer = DataSanitizer()
+
+    def set_logging_config(self, config: LoggingConfig):
+        """Set the logging configuration for this connector."""
+        self._logging_config = config
+
+    def _should_sanitize(self) -> bool:
+        """Check if log sanitization is enabled."""
+        return self._logging_config.sanitize_logs
+
+    def log_operation_start(self, operation: str, **kwargs):
+        """Log the start of a major operation."""
+        logger.info("Starting %s", operation)
+
+        if kwargs:
+            if self._should_sanitize():
+                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
+                logger.debug("%s parameters: %s", operation, sanitized_kwargs)
+            else:
+                logger.debug("%s parameters: %s", operation, kwargs)
+
+    def log_operation_complete(self, operation: str, count: Optional[int] = None, **kwargs):
+        """Log the completion of a major operation."""
+        if count is not None:
+            logger.info("Completed %s (%s items)", operation, count)
+        else:
+            logger.info("Completed %s", operation)
+
+        if kwargs:
+            if self._should_sanitize():
+                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
+                logger.debug("%s results: %s", operation, sanitized_kwargs)
+            else:
+                logger.debug("%s results: %s", operation, kwargs)
+
+    def log_connection_validated(self, connector_type: str, endpoint: Optional[str] = None):
+        """Log successful connection validation."""
+        if self._logging_config.show_connection_details and endpoint:
+            if self._should_sanitize():
+                sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
+                logger.debug(
+                    "Connection to %s validated successfully: %s",
+                    connector_type,
+                    sanitized_endpoint,
+                )
+            else:
+                logger.debug(
+                    "Connection to %s validated successfully: %s", connector_type, endpoint
+                )
+        else:
+            logger.debug("Connection to %s validated successfully", connector_type)
+
+    def log_connection_failed(
+        self, connector_type: str, error: Exception, endpoint: Optional[str] = None
+    ):
+        """Log connection validation failure."""
+        if endpoint:
+            if self._should_sanitize():
+                sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
+                logger.error(
+                    "Failed to validate %s connection to %s: %s",
+                    connector_type,
+                    sanitized_endpoint,
+                    error,
+                    exc_info=True,
+                )
+            else:
+                logger.error(
+                    "Failed to validate %s connection to %s: %s",
+                    connector_type,
+                    endpoint,
+                    error,
+                    exc_info=True,
+                )
+        else:
+            logger.error(
+                "Failed to validate %s connection: %s", connector_type, error, exc_info=True
+            )
+
+    def log_progress(
+        self, current: int, total: int, item_type: str = "items", operation: str = "Processing"
+    ):
+        """Log progress for long-running operations."""
+        if total > 0 and current % self._logging_config.log_progress_interval == 0:
+            progress = (current / total) * 100
+            logger.info("%s: %s/%s %s (%.1f%%)", operation, current, total, item_type, progress)
+
+    def log_batch_progress(
+        self, batch_num: int, total_batches: int, batch_size: int, operation: str = "Processing"
+    ):
+        """Log progress for batch operations."""
+        logger.info("%s batch %s/%s (%s items)", operation, batch_num, total_batches, batch_size)
+
+    def log_document_operation(
+        self,
+        operation: str,
+        document_location: Optional[str] = None,
+        document_id: Optional[str] = None,
+        content_size: Optional[int] = None,
+        **kwargs,
+    ):
+        """Log document-related operations (universal for all connector types)."""
+        if self._logging_config.log_document_locations and document_location:
+            if self._should_sanitize():
+                sanitized_location = self._sanitizer.sanitize_location(document_location)
+                logger.debug("%s: %s", operation, sanitized_location)
+            else:
+                logger.debug("%s: %s", operation, document_location)
+        elif self._logging_config.log_document_ids and document_id:
+            if self._should_sanitize():
+                sanitized_id = self._sanitizer.sanitize_document_id(document_id)
+                logger.debug("%s: %s", operation, sanitized_id)
+            else:
+                logger.debug("%s: %s", operation, document_id)
+        else:
+            logger.debug("%s: <document>", operation)
+
+        if content_size is not None:
+            kwargs["content_size"] = content_size
+
+        if kwargs:
+            if self._should_sanitize():
+                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
+                logger.debug("%s details: %s", operation, sanitized_kwargs)
+            else:
+                logger.debug("%s details: %s", operation, kwargs)
+
+    def log_file_operation(
+        self,
+        operation: str,
+        file_path: Optional[str] = None,
+        file_id: Optional[str] = None,
+        **kwargs,
+    ):
+        """Log file-related operations (backward compatibility wrapper)."""
+        self.log_document_operation(
+            operation=operation, document_location=file_path, document_id=file_id, **kwargs
+        )
+
+    def log_document_download_start(
+        self,
+        document_location: Optional[str] = None,
+        document_id: Optional[str] = None,
+        content_size: Optional[int] = None,
+    ):
+        """Log the start of a document download/retrieval."""
+        logger.info("Starting document download")
+
+        self.log_document_operation(
+            "Download",
+            document_location=document_location,
+            document_id=document_id,
+            content_size=content_size,
+        )
+
+    def log_document_download_complete(
+        self,
+        document_location: Optional[str] = None,
+        document_id: Optional[str] = None,
+        download_path: Optional[str] = None,
+        content_size: Optional[int] = None,
+        items_retrieved: Optional[int] = None,
+    ):
+        """Log the completion of a document download/retrieval."""
+        logger.info("Document download completed")
+
+        details = {}
+        if download_path:
+            details["download_path"] = download_path
+        if items_retrieved is not None:
+            details["items_retrieved"] = items_retrieved
+
+        self.log_document_operation(
+            "Download completed",
+            document_location=document_location,
+            document_id=document_id,
+            content_size=content_size,
+            **details,
+        )
+
+    def log_download_start(
+        self,
+        file_path: Optional[str] = None,
+        file_id: Optional[str] = None,
+        file_size: Optional[int] = None,
+    ):
+        """Log the start of a file download (backward compatibility wrapper)."""
+        self.log_document_download_start(
+            document_location=file_path, document_id=file_id, content_size=file_size
+        )
+
+    def log_download_complete(
+        self,
+        file_path: Optional[str] = None,
+        file_id: Optional[str] = None,
+        download_path: Optional[str] = None,
+        file_size: Optional[int] = None,
+    ):
+        """Log the completion of a file download (backward compatibility wrapper)."""
+        self.log_document_download_complete(
+            document_location=file_path,
+            document_id=file_id,
+            download_path=download_path,
+            content_size=file_size,
+        )
+
+    def log_upload_start(
+        self,
+        file_path: Optional[str] = None,
+        destination: Optional[str] = None,
+        file_size: Optional[int] = None,
+    ):
+        """Log the start of a file upload."""
+        logger.info("Starting file upload")
+
+        details = {}
+        if destination:
+            details["destination"] = destination
+
+        self.log_file_operation("Upload", file_path=file_path, **details)
+
+    def log_upload_complete(
+        self,
+        file_path: Optional[str] = None,
+        destination: Optional[str] = None,
+        file_id: Optional[str] = None,
+        file_size: Optional[int] = None,
+    ):
+        """Log the completion of a file upload."""
+        logger.info("File upload completed")
+
+        details = {}
+        if destination:
+            details["destination"] = destination
+        if file_id:
+            details["file_id"] = file_id
+
+        self.log_file_operation("Upload completed", file_path=file_path, **details)
+
+    def log_indexing_start(self, source_type: str, count: Optional[int] = None):
+        """Log the start of indexing operation."""
+        if count:
+            logger.info("Starting indexing of %s (%s items)", source_type, count)
+        else:
+            logger.info("Starting indexing of %s", source_type)
+
+    def log_indexing_complete(self, source_type: str, count: int):
+        """Log the completion of indexing operation."""
+        logger.info("Indexing completed: %s %s items indexed", count, source_type)
+
+    def log_info(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
+        """Log an info message with optional context and sanitization."""
+        logger.info(message)
+        self._log_context("Info", context, **kwargs)
+
+    def log_debug(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
+        """Log a debug message with optional context and sanitization."""
+        logger.debug(message)
+        self._log_context("Debug", context, **kwargs)
+
+    def log_warning(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
+        """Log a warning message with optional context and sanitization."""
+        logger.warning(message)
+        self._log_context("Warning", context, **kwargs)
+
+    def log_error(
+        self,
+        message: str,
+        error: Optional[Exception] = None,
+        context: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        """Log an error message with optional exception, context and sanitization."""
+        if error:
+            logger.error("%s: %s", message, error, exc_info=True)
+        else:
+            logger.error(message)
+        self._log_context("Error", context, **kwargs)
+
+    def _log_context(self, log_type: str, context: Optional[Dict[str, Any]], **kwargs):
+        """Helper method to log context with sanitization."""
+        all_context = {}
+        if context:
+            all_context.update(context)
+        if kwargs:
+            all_context.update(kwargs)
+
+        if all_context:
+            if self._should_sanitize():
+                sanitized_context = self._sanitizer.sanitize_dict(all_context)
+                logger.debug("%s context: %s", log_type, sanitized_context)
+            else:
+                logger.debug("%s context: %s", log_type, all_context)
+
+    def log_api_call(self, method: str, endpoint: str, status_code: Optional[int] = None, **kwargs):
+        """Log API call details."""
+        if self._should_sanitize():
+            sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
+            if status_code:
+                logger.debug("API call: %s %s -> %s", method, sanitized_endpoint, status_code)
+            else:
+                logger.debug("API call: %s %s", method, sanitized_endpoint)
+        else:
+            if status_code:
+                logger.debug("API call: %s %s -> %s", method, endpoint, status_code)
+            else:
+                logger.debug("API call: %s %s", method, endpoint)
+
+        if kwargs:
+            if self._should_sanitize():
+                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
+                logger.debug("API call details: %s", sanitized_kwargs)
+            else:
+                logger.debug("API call details: %s", kwargs)

unstructured_ingest/processes/utils/logging/sanitizer.py
@@ -0,0 +1,117 @@
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+from urllib.parse import urlparse
+
+
+class DataSanitizer:
+    """Utility class for sanitizing sensitive data in logs."""
+
+    @staticmethod
+    def sanitize_path(path: Union[str, Path]) -> str:
+        """Sanitize file paths for logging, showing only filename and partial path."""
+        if not path:
+            return "<empty>"
+
+        path_str = str(path)
+        path_obj = Path(path_str)
+
+        if len(path_obj.parts) > 2:
+            return f".../{path_obj.parent.name}/{path_obj.name}"
+        return path_obj.name
+
+    @staticmethod
+    def sanitize_id(identifier: str) -> str:
+        """Sanitize IDs for logging, showing only first/last few characters."""
+        if not identifier:
+            return "<id>"
+        if len(identifier) < 10:
+            half_len = len(identifier) // 2
+            return f"{identifier[:half_len]}..."
+        return f"{identifier[:4]}...{identifier[-4:]}"
+
+    @staticmethod
+    def sanitize_url(url: str) -> str:
+        """Sanitize URLs for logging, removing sensitive query parameters."""
+        if not url:
+            return "<url>"
+        try:
+            parsed = urlparse(url)
+            return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
+        except (ValueError, TypeError):
+            return "<url>"
+
+    @staticmethod
+    def sanitize_token(token: str) -> str:
+        """Sanitize tokens and secrets for logging."""
+        if not token:
+            return "<token>"
+        if len(token) < 10:
+            half_len = len(token) // 2
+            return f"{token[:half_len]}..."
+        return f"{token[:4]}...{token[-4:]}"
+
+    @staticmethod
+    def sanitize_location(location: Union[str, Path]) -> str:
+        """Sanitize document locations (file paths, URLs, database references) for logging."""
+        if not location:
+            return "<empty>"
+
+        location_str = str(location)
+
+        # Handle URLs
+        if location_str.startswith(("http://", "https://", "ftp://", "ftps://")):
+            return DataSanitizer.sanitize_url(location_str)
+
+        # Handle database-style references (table:id, collection/document, etc.)
+        if ":" in location_str and not location_str.startswith("/"):
+            parts = location_str.split(":", 1)
+            if len(parts) == 2:
+                table_name, record_id = parts
+                return f"{table_name}:{DataSanitizer.sanitize_id(record_id)}"
+
+        return DataSanitizer.sanitize_path(location_str)
+
+    @staticmethod
+    def sanitize_document_id(document_id: str) -> str:
+        """Sanitize document IDs for logging (alias for sanitize_id for clarity)."""
+        return DataSanitizer.sanitize_id(document_id)
+
+    @staticmethod
+    def sanitize_dict(data: Dict[str, Any], sensitive_keys: Optional[set] = None) -> Dict[str, Any]:
+        """Sanitize dictionary data for logging."""
+        if sensitive_keys is None:
+            sensitive_keys = {
+                "password",
+                "token",
+                "secret",
+                "key",
+                "api_key",
+                "access_token",
+                "refresh_token",
+                "client_secret",
+                "private_key",
+                "credentials",
+            }
+
+        sanitized = {}
+        for k, v in data.items():
+            key_lower = k.lower()
+            if any(sensitive_key in key_lower for sensitive_key in sensitive_keys):
+                sanitized[k] = DataSanitizer.sanitize_token(str(v))
+            elif isinstance(v, dict):
+                sanitized[k] = DataSanitizer.sanitize_dict(v, sensitive_keys)
+            elif isinstance(v, (str, Path)) and (
+                "path" in key_lower
+                or "file" in key_lower
+                or "location" in key_lower
+                or "document_location" in key_lower
+            ):
+                sanitized[k] = DataSanitizer.sanitize_location(v)
+            elif isinstance(v, str) and (
+                ("id" in key_lower and len(str(v)) > 8)
+                or ("document_id" in key_lower and len(str(v)) > 8)
+            ):
+                sanitized[k] = DataSanitizer.sanitize_document_id(v)
+            else:
+                sanitized[k] = v
+        return sanitized
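
The redaction rules above are deterministic, so their behavior can be pinned down with a few direct calls (a sketch; the inputs are made up):

    from unstructured_ingest.processes.utils.logging.sanitizer import DataSanitizer

    DataSanitizer.sanitize_path("/home/user/projects/data/report.pdf")  # ".../data/report.pdf"
    DataSanitizer.sanitize_id("a1b2c3d4e5f6")                           # "a1b2...e5f6"
    DataSanitizer.sanitize_url("https://host/v1/items?sig=abc")         # "https://host/v1/items"
    DataSanitizer.sanitize_dict(
        {"api_key": "sk-1234567890", "file_path": "/tmp/a/b/c.txt", "count": 3}
    )  # {"api_key": "sk-1...7890", "file_path": ".../b/c.txt", "count": 3}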

unstructured_ingest/utils/__init__.py
@@ -2,4 +2,4 @@
 
 from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
 
-__all__ = ["mkdir_concurrent_safe"]
\ No newline at end of file
+__all__ = ["mkdir_concurrent_safe"]

unstructured_ingest/utils/tls.py
@@ -0,0 +1,15 @@
+import os
+import ssl
+
+import certifi
+
+
+def ssl_context_with_optional_ca_override():
+    """
+    # https://www.python-httpx.org/advanced/ssl/#working-with-ssl_cert_file-and-ssl_cert_dir
+    # We choose REQUESTS_CA_BUNDLE because that works with many other Python packages.
+    """
+    return ssl.create_default_context(
+        cafile=os.environ.get("REQUESTS_CA_BUNDLE", certifi.where()),
+        capath=os.environ.get("REQUESTS_CA_BUNDLE"),
+    )
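
The helper can also be consumed with httpx directly, the same way the embedders wire it into the OpenAI SDK above (the direct httpx usage here is a sketch, not something the wheel itself does):

    import httpx

    from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override

    # With REQUESTS_CA_BUNDLE unset this trusts certifi's bundle; when the variable
    # points at a private CA file, that bundle is used for verification instead.
    client = httpx.Client(verify=ssl_context_with_optional_ca_override())
    response = client.get("https://pypi.org/simple/")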

unstructured_ingest-1.1.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: unstructured_ingest
-Version: 1.0.56
+Version: 1.1.0
 Summary: Local ETL data pipeline to get data RAG ready
 Author-email: Unstructured Technologies <devops@unstructuredai.io>
 License-Expression: Apache-2.0
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: <3.13,>=3.9
+Requires-Dist: certifi>=2025.7.14
 Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
 Requires-Dist: pydantic>=2.7

unstructured_ingest-1.1.0.dist-info/RECORD
@@ -1,10 +1,10 @@
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=QS35e1wNMcYW0l5aYg7WzCRBflWOLvHKi5a0xe-dlyo,43
+unstructured_ingest/__version__.py,sha256=OTJtt59bB59UuRwC7CjPgJNmkdDC7RUC5Ukrfd-P-CE,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
 unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
 unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
-unstructured_ingest/otel.py,sha256=NsUqOolA0gt69eFhZLABjVpcKoM9aus-AbxIKqWqPTc,4127
+unstructured_ingest/otel.py,sha256=wxnkdZqFtlypmOn4QX6uLxjGa7jSoFabP3PEG5FjH1g,4669
 unstructured_ingest/unstructured_api.py,sha256=4e2ZNWIihk0eje4R3ZQ0NOYNbmMZDv_O-rnJo94kaGE,5127
 unstructured_ingest/cli/README.md,sha256=lfsXY2jOO__OuDYcIs8N0yLhZWzrSQ_dyXbSFtEMlQ8,1504
 unstructured_ingest/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -22,18 +22,18 @@ unstructured_ingest/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 unstructured_ingest/data_types/entities.py,sha256=ECc6EkZ5_ZUvK7uaALYOynfFmofIrHYIJZfb67hUIxA,371
 unstructured_ingest/data_types/file_data.py,sha256=J0RQa7YXhhxiLVzhPbF5Hl2nzSpxLFK9vrP6RTBWlSg,3833
 unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-unstructured_ingest/embed/azure_openai.py,sha256=fk9yTG-Xr1TSu4n4l8O3DQo9-oceVL9fX_8rehwXsNM,1798
+unstructured_ingest/embed/azure_openai.py,sha256=Q_buBkAcx9FBuTsAqKbRU8vd9vDh8JoDOEth4fFxHbg,2160
 unstructured_ingest/embed/bedrock.py,sha256=dzfCsatB0i8hUp1YnXmoImoxgvUdZ4srKI6eSvn-lYM,9132
 unstructured_ingest/embed/huggingface.py,sha256=6Gx9L3xa3cv9fX4AMuLsePJQF4T_jwkKjovfqF5X1NM,2435
 unstructured_ingest/embed/interfaces.py,sha256=Y3PLhgWnMDmtpugE37hlAiBIbC8izrFFXXkrPVby-HY,5137
 unstructured_ingest/embed/mixedbreadai.py,sha256=uKTqzoi4M_WeYZu-qc_TSxwJONOESzxVbBLUbD1Wbns,3922
 unstructured_ingest/embed/octoai.py,sha256=yZuD7R4mEKS4Jjyae_IrNWogMPOFFS8gW5oUllj3ROU,4540
-unstructured_ingest/embed/openai.py,sha256=TMEOPVfm_OSs4tb3Ymd6q5J49R_-YKvO4TOqCHb3bwk,4647
+unstructured_ingest/embed/openai.py,sha256=09I5BIrb-iGsv92LOV46-F7oZ7j1JnJIOQFARNKVq3k,5029
 unstructured_ingest/embed/togetherai.py,sha256=ykaveEUBxBGBzRlmWc9utCFQuUWHdbW4F9KAb-uBAJM,3630
 unstructured_ingest/embed/vertexai.py,sha256=DphvPhiYdXTMrQxJCd-64vMs4iVdLY_BphHqz3n5HfM,3758
 unstructured_ingest/embed/voyageai.py,sha256=EOrYzaoXOZ6C4fNkMlCgb8KA8rdfgVXN3USMFpnn0Bs,4698
 unstructured_ingest/interfaces/__init__.py,sha256=QIkWqjsq9INTa89gPuXlMlQL4s3y5TqLmPkuVuTyXcs,795
-unstructured_ingest/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
+unstructured_ingest/interfaces/connector.py,sha256=wYWIEAL99KdQDDzzDYSf_yE8p1wjThSPMgEV5qyfiPc,1885
 unstructured_ingest/interfaces/downloader.py,sha256=xX0ZzsFRSzZb7SAeoeQph8sIbVq13DRw-3MYkdADrY0,2918
 unstructured_ingest/interfaces/indexer.py,sha256=c2FwWJEQHfFD6vO-tGfYLpLiIs-TYViLAt8YmHfDbaM,824
 unstructured_ingest/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
@@ -79,12 +79,12 @@ unstructured_ingest/processes/connectors/local.py,sha256=CesMduUiSPqdJpqIyW28icG
 unstructured_ingest/processes/connectors/milvus.py,sha256=L-PM5osheNyNsLGYZmiF3rRmeulp7Ejk92JCoaQ_F9Y,12075
 unstructured_ingest/processes/connectors/mongodb.py,sha256=idjolwS5TXShcIz2jR_socSgh8HOzJwyOnzE1qLUPBw,15362
 unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
-unstructured_ingest/processes/connectors/onedrive.py,sha256=fGwa-x9D3gyLQtaSXbz6pfiFiLpnO2GVtJmU5kb-qd0,20197
+unstructured_ingest/processes/connectors/onedrive.py,sha256=JPa30X2abVx9SHye_cLOOj4csj_ut8nMjwRnMcgHFhI,20163
 unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y1vhjsS8uSg7aZ2LBrTjfHk,9376
 unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
 unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
 unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
-unstructured_ingest/processes/connectors/sharepoint.py,sha256=IV6gs4vx4q-QEDwA-Rm6yYCwzopuVl8bKC8CcBU1Lkk,10677
+unstructured_ingest/processes/connectors/sharepoint.py,sha256=ooPJoAEHj-epEM39iiYbNWdDUdEwt466fLjIcYSNTM8,10670
 unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
 unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
 unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
@@ -109,9 +109,9 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
 unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
 unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
 unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
-unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=S1siX888TfHAByEXuvOqkTbcNAzx-m5UNqhKjiEKR5s,14524
+unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=p0u6JL6ouEPe4R_i_rAhzlvSDyMO3-NDHiw_CtPaCTc,17875
 unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
-unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=2ZV6b2E2pIsf_ab1Lty74FwpMnJZhpQUdamPgpwcKsQ,7141
+unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=P5nd3hamhLFO3l5nV3lMuIxHtb_rZYFP4F6q_py3xpc,7492
 unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
 unstructured_ingest/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py,sha256=kf0UpgdAY2KK1R1FbAB6GEBBAIOeYQ8cZIr3bp660qM,374
@@ -218,9 +218,11 @@ unstructured_ingest/processes/connectors/weaviate/weaviate.py,sha256=yB67gxvo3X0
 unstructured_ingest/processes/connectors/zendesk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/zendesk/client.py,sha256=GvPIpx4aYdD58-edHgvCFjFao94uR0O5Yf4dT9NCmSk,11952
 unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=doS6d7ZhXBgJN8aPVf7vnQr8BciQbzX8-4yl4_hDZ7w,9253
-unstructured_ingest/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/processes/utils/__init__.py,sha256=v3IQ-Ft0f7PoHhGcYiiD6Yrr6oi-RiGeD6nTKowbEDk,199
 unstructured_ingest/processes/utils/blob_storage.py,sha256=apMUmm9loxdbTRkkLH4VhG9kUVyiw9PFUJheSDxSxPk,1023
-unstructured_ingest/utils/__init__.py,sha256=URnsQu-y3Vmc7vn6GVL2sYuXxlSJ3naR3c9o6oKSm3w,157
+unstructured_ingest/processes/utils/logging/connector.py,sha256=xKsXSavbu2U8ZP0KP7jk5192ZDr5HzaBCBCf0GKe1HI,14109
+unstructured_ingest/processes/utils/logging/sanitizer.py,sha256=ZG4Cdcc2yrVmmgdUOJCaUKgp5mZhBpEOMjAbj5Cth_s,4251
+unstructured_ingest/utils/__init__.py,sha256=mU8mlrdah00MPuZM6JqXTkrpXK-sDYiv5y5Mwl8eesM,158
 unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
 unstructured_ingest/utils/compression.py,sha256=PPC-ys3qEAtELf6-irhp8v8M634pFFCJEvA6o7PXaLI,2712
 unstructured_ingest/utils/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
@@ -232,8 +234,9 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
 unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
 unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
 unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
-unstructured_ingest-1.0.56.dist-info/METADATA,sha256=LOovkqoz-Zu0Vn1qlv1uiS_FhhRVZYqDU7OAv_pdgEE,8842
-unstructured_ingest-1.0.56.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-unstructured_ingest-1.0.56.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-1.0.56.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-1.0.56.dist-info/RECORD,,
+unstructured_ingest/utils/tls.py,sha256=Ra8Mii1F4VqErRreg76PBI0eAqPBC009l0sSHa8FdnA,448
+unstructured_ingest-1.1.0.dist-info/METADATA,sha256=tJonV6SbQB5XL3BeyL8coDFhzzChMKGuSPQWQ3aoOdE,8875
+unstructured_ingest-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+unstructured_ingest-1.1.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-1.1.0.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-1.1.0.dist-info/RECORD,,