unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,365 @@
1
+ from typing import Any, Dict, Optional
2
+
3
+ from unstructured_ingest.logger import logger
4
+ from unstructured_ingest.processes.utils.logging.sanitizer import DataSanitizer
5
+
6
+
7
+ class LoggingConfig:
8
+ """Configuration for connector logging behavior."""
9
+
10
+ def __init__(
11
+ self,
12
+ log_file_paths: bool = False,
13
+ log_document_locations: Optional[bool] = None,
14
+ log_ids: bool = False,
15
+ log_document_ids: Optional[bool] = None,
16
+ log_progress_interval: int = 10,
17
+ sanitize_logs: bool = True,
18
+ show_connection_details: bool = False,
19
+ ):
20
+ # Backward compatibility: if new parameters aren't specified, use old ones
21
+ self.log_file_paths = log_file_paths
22
+ self.log_document_locations = (
23
+ log_document_locations if log_document_locations is not None else log_file_paths
24
+ )
25
+
26
+ self.log_ids = log_ids
27
+ self.log_document_ids = log_document_ids if log_document_ids is not None else log_ids
28
+
29
+ self.log_progress_interval = log_progress_interval
30
+ self.sanitize_logs = sanitize_logs
31
+ self.show_connection_details = show_connection_details
32
+
33
+
34
+ class ConnectorLoggingMixin:
35
+ """Mixin class providing standardized logging patterns for connectors."""
36
+
37
+ def __init__(self, *args, **kwargs):
38
+ """
39
+ Initialize the mixin by setting up logging configuration and data sanitization.
40
+
41
+ This method ensures that the mixin provides standardized logging patterns for connectors.
42
+ It initializes:
43
+ - `_logging_config`: Manages logging behavior and settings.
44
+ - `_sanitizer`: Handles sanitization of sensitive data in logs.
45
+
46
+ Args:
47
+ *args: Positional arguments passed to the parent class.
48
+ **kwargs: Keyword arguments passed to the parent class.
49
+ """
50
+ super().__init__(*args, **kwargs)
51
+ self._logging_config = LoggingConfig()
52
+ self._sanitizer = DataSanitizer()
53
+
54
+ def set_logging_config(self, config: LoggingConfig):
55
+ """Set the logging configuration for this connector."""
56
+ self._logging_config = config
57
+
58
+ def _should_sanitize(self) -> bool:
59
+ """Check if log sanitization is enabled."""
60
+ return self._logging_config.sanitize_logs
61
+
62
+ def log_operation_start(self, operation: str, **kwargs):
63
+ """Log the start of a major operation."""
64
+ logger.info("Starting %s", operation)
65
+
66
+ if kwargs:
67
+ if self._should_sanitize():
68
+ sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
69
+ logger.debug("%s parameters: %s", operation, sanitized_kwargs)
70
+ else:
71
+ logger.debug("%s parameters: %s", operation, kwargs)
72
+
73
+ def log_operation_complete(self, operation: str, count: Optional[int] = None, **kwargs):
74
+ """Log the completion of a major operation."""
75
+ if count is not None:
76
+ logger.info("Completed %s (%s items)", operation, count)
77
+ else:
78
+ logger.info("Completed %s", operation)
79
+
80
+ if kwargs:
81
+ if self._should_sanitize():
82
+ sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
83
+ logger.debug("%s results: %s", operation, sanitized_kwargs)
84
+ else:
85
+ logger.debug("%s results: %s", operation, kwargs)
86
+
87
+ def log_connection_validated(self, connector_type: str, endpoint: Optional[str] = None):
88
+ """Log successful connection validation."""
89
+ if self._logging_config.show_connection_details and endpoint:
90
+ if self._should_sanitize():
91
+ sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
92
+ logger.debug(
93
+ "Connection to %s validated successfully: %s",
94
+ connector_type,
95
+ sanitized_endpoint,
96
+ )
97
+ else:
98
+ logger.debug(
99
+ "Connection to %s validated successfully: %s", connector_type, endpoint
100
+ )
101
+ else:
102
+ logger.debug("Connection to %s validated successfully", connector_type)
103
+
104
+ def log_connection_failed(
105
+ self, connector_type: str, error: Exception, endpoint: Optional[str] = None
106
+ ):
107
+ """Log connection validation failure."""
108
+ if endpoint:
109
+ if self._should_sanitize():
110
+ sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
111
+ logger.error(
112
+ "Failed to validate %s connection to %s: %s",
113
+ connector_type,
114
+ sanitized_endpoint,
115
+ error,
116
+ exc_info=True,
117
+ )
118
+ else:
119
+ logger.error(
120
+ "Failed to validate %s connection to %s: %s",
121
+ connector_type,
122
+ endpoint,
123
+ error,
124
+ exc_info=True,
125
+ )
126
+ else:
127
+ logger.error(
128
+ "Failed to validate %s connection: %s", connector_type, error, exc_info=True
129
+ )
130
+
131
+ def log_progress(
132
+ self, current: int, total: int, item_type: str = "items", operation: str = "Processing"
133
+ ):
134
+ """Log progress for long-running operations."""
135
+ if total > 0 and current % self._logging_config.log_progress_interval == 0:
136
+ progress = (current / total) * 100
137
+ logger.info("%s: %s/%s %s (%.1f%%)", operation, current, total, item_type, progress)
138
+
139
+ def log_batch_progress(
140
+ self, batch_num: int, total_batches: int, batch_size: int, operation: str = "Processing"
141
+ ):
142
+ """Log progress for batch operations."""
143
+ logger.info("%s batch %s/%s (%s items)", operation, batch_num, total_batches, batch_size)
144
+
145
+ def log_document_operation(
146
+ self,
147
+ operation: str,
148
+ document_location: Optional[str] = None,
149
+ document_id: Optional[str] = None,
150
+ content_size: Optional[int] = None,
151
+ **kwargs,
152
+ ):
153
+ """Log document-related operations (universal for all connector types)."""
154
+ if self._logging_config.log_document_locations and document_location:
155
+ if self._should_sanitize():
156
+ sanitized_location = self._sanitizer.sanitize_location(document_location)
157
+ logger.debug("%s: %s", operation, sanitized_location)
158
+ else:
159
+ logger.debug("%s: %s", operation, document_location)
160
+ elif self._logging_config.log_document_ids and document_id:
161
+ if self._should_sanitize():
162
+ sanitized_id = self._sanitizer.sanitize_document_id(document_id)
163
+ logger.debug("%s: %s", operation, sanitized_id)
164
+ else:
165
+ logger.debug("%s: %s", operation, document_id)
166
+ else:
167
+ logger.debug("%s: <document>", operation)
168
+
169
+ if content_size is not None:
170
+ kwargs["content_size"] = content_size
171
+
172
+ if kwargs:
173
+ if self._should_sanitize():
174
+ sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
175
+ logger.debug("%s details: %s", operation, sanitized_kwargs)
176
+ else:
177
+ logger.debug("%s details: %s", operation, kwargs)
178
+
179
+ def log_file_operation(
180
+ self,
181
+ operation: str,
182
+ file_path: Optional[str] = None,
183
+ file_id: Optional[str] = None,
184
+ **kwargs,
185
+ ):
186
+ """Log file-related operations (backward compatibility wrapper)."""
187
+ self.log_document_operation(
188
+ operation=operation, document_location=file_path, document_id=file_id, **kwargs
189
+ )
190
+
191
+ def log_document_download_start(
192
+ self,
193
+ document_location: Optional[str] = None,
194
+ document_id: Optional[str] = None,
195
+ content_size: Optional[int] = None,
196
+ ):
197
+ """Log the start of a document download/retrieval."""
198
+ logger.info("Starting document download")
199
+
200
+ self.log_document_operation(
201
+ "Download",
202
+ document_location=document_location,
203
+ document_id=document_id,
204
+ content_size=content_size,
205
+ )
206
+
207
+ def log_document_download_complete(
208
+ self,
209
+ document_location: Optional[str] = None,
210
+ document_id: Optional[str] = None,
211
+ download_path: Optional[str] = None,
212
+ content_size: Optional[int] = None,
213
+ items_retrieved: Optional[int] = None,
214
+ ):
215
+ """Log the completion of a document download/retrieval."""
216
+ logger.info("Document download completed")
217
+
218
+ details = {}
219
+ if download_path:
220
+ details["download_path"] = download_path
221
+ if items_retrieved is not None:
222
+ details["items_retrieved"] = items_retrieved
223
+
224
+ self.log_document_operation(
225
+ "Download completed",
226
+ document_location=document_location,
227
+ document_id=document_id,
228
+ content_size=content_size,
229
+ **details,
230
+ )
231
+
232
+ def log_download_start(
233
+ self,
234
+ file_path: Optional[str] = None,
235
+ file_id: Optional[str] = None,
236
+ file_size: Optional[int] = None,
237
+ ):
238
+ """Log the start of a file download (backward compatibility wrapper)."""
239
+ self.log_document_download_start(
240
+ document_location=file_path, document_id=file_id, content_size=file_size
241
+ )
242
+
243
+ def log_download_complete(
244
+ self,
245
+ file_path: Optional[str] = None,
246
+ file_id: Optional[str] = None,
247
+ download_path: Optional[str] = None,
248
+ file_size: Optional[int] = None,
249
+ ):
250
+ """Log the completion of a file download (backward compatibility wrapper)."""
251
+ self.log_document_download_complete(
252
+ document_location=file_path,
253
+ document_id=file_id,
254
+ download_path=download_path,
255
+ content_size=file_size,
256
+ )
257
+
258
+ def log_upload_start(
259
+ self,
260
+ file_path: Optional[str] = None,
261
+ destination: Optional[str] = None,
262
+ file_size: Optional[int] = None,
263
+ ):
264
+ """Log the start of a file upload."""
265
+ logger.info("Starting file upload")
266
+
267
+ details = {}
268
+ if destination:
269
+ details["destination"] = destination
270
+
271
+ self.log_file_operation("Upload", file_path=file_path, **details)
272
+
273
+ def log_upload_complete(
274
+ self,
275
+ file_path: Optional[str] = None,
276
+ destination: Optional[str] = None,
277
+ file_id: Optional[str] = None,
278
+ file_size: Optional[int] = None,
279
+ ):
280
+ """Log the completion of a file upload."""
281
+ logger.info("File upload completed")
282
+
283
+ details = {}
284
+ if destination:
285
+ details["destination"] = destination
286
+ if file_id:
287
+ details["file_id"] = file_id
288
+
289
+ self.log_file_operation("Upload completed", file_path=file_path, **details)
290
+
291
+ def log_indexing_start(self, source_type: str, count: Optional[int] = None):
292
+ """Log the start of indexing operation."""
293
+ if count:
294
+ logger.info("Starting indexing of %s (%s items)", source_type, count)
295
+ else:
296
+ logger.info("Starting indexing of %s", source_type)
297
+
298
+ def log_indexing_complete(self, source_type: str, count: int):
299
+ """Log the completion of indexing operation."""
300
+ logger.info("Indexing completed: %s %s items indexed", count, source_type)
301
+
302
+ def log_info(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
303
+ """Log an info message with optional context and sanitization."""
304
+ logger.info(message)
305
+ self._log_context("Info", context, **kwargs)
306
+
307
+ def log_debug(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
308
+ """Log a debug message with optional context and sanitization."""
309
+ logger.debug(message)
310
+ self._log_context("Debug", context, **kwargs)
311
+
312
+ def log_warning(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
313
+ """Log a warning message with optional context and sanitization."""
314
+ logger.warning(message)
315
+ self._log_context("Warning", context, **kwargs)
316
+
317
+ def log_error(
318
+ self,
319
+ message: str,
320
+ error: Optional[Exception] = None,
321
+ context: Optional[Dict[str, Any]] = None,
322
+ **kwargs,
323
+ ):
324
+ """Log an error message with optional exception, context and sanitization."""
325
+ if error:
326
+ logger.error("%s: %s", message, error, exc_info=True)
327
+ else:
328
+ logger.error(message)
329
+ self._log_context("Error", context, **kwargs)
330
+
331
+ def _log_context(self, log_type: str, context: Optional[Dict[str, Any]], **kwargs):
332
+ """Helper method to log context with sanitization."""
333
+ all_context = {}
334
+ if context:
335
+ all_context.update(context)
336
+ if kwargs:
337
+ all_context.update(kwargs)
338
+
339
+ if all_context:
340
+ if self._should_sanitize():
341
+ sanitized_context = self._sanitizer.sanitize_dict(all_context)
342
+ logger.debug("%s context: %s", log_type, sanitized_context)
343
+ else:
344
+ logger.debug("%s context: %s", log_type, all_context)
345
+
346
+ def log_api_call(self, method: str, endpoint: str, status_code: Optional[int] = None, **kwargs):
347
+ """Log API call details."""
348
+ if self._should_sanitize():
349
+ sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
350
+ if status_code:
351
+ logger.debug("API call: %s %s -> %s", method, sanitized_endpoint, status_code)
352
+ else:
353
+ logger.debug("API call: %s %s", method, sanitized_endpoint)
354
+ else:
355
+ if status_code:
356
+ logger.debug("API call: %s %s -> %s", method, endpoint, status_code)
357
+ else:
358
+ logger.debug("API call: %s %s", method, endpoint)
359
+
360
+ if kwargs:
361
+ if self._should_sanitize():
362
+ sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
363
+ logger.debug("API call details: %s", sanitized_kwargs)
364
+ else:
365
+ logger.debug("API call details: %s", kwargs)
@@ -0,0 +1,117 @@
1
+ from pathlib import Path
2
+ from typing import Any, Dict, Optional, Union
3
+ from urllib.parse import urlparse
4
+
5
+
6
class DataSanitizer:
    """Utility class for sanitizing sensitive data in logs."""

    @staticmethod
    def sanitize_path(path: Union[str, Path]) -> str:
        """Sanitize file paths for logging, showing only filename and partial path."""
        if not path:
            return "<empty>"

        path_obj = Path(str(path))

        # Show at most the immediate parent directory plus the filename.
        if len(path_obj.parts) > 2:
            return f".../{path_obj.parent.name}/{path_obj.name}"
        return path_obj.name

    @staticmethod
    def sanitize_id(identifier: str) -> str:
        """Sanitize IDs for logging, showing only first/last few characters."""
        if not identifier:
            return "<id>"
        if len(identifier) < 10:
            # Short IDs: reveal only the first half to avoid exposing most of it.
            half_len = len(identifier) // 2
            return f"{identifier[:half_len]}..."
        return f"{identifier[:4]}...{identifier[-4:]}"

    @staticmethod
    def sanitize_url(url: str) -> str:
        """Sanitize URLs for logging, removing credentials and query parameters."""
        if not url:
            return "<url>"
        try:
            parsed = urlparse(url)
            netloc = parsed.netloc
            # Fix: strip basic-auth userinfo ("user:password@host") — previously
            # the raw netloc was logged, leaking embedded credentials. The query
            # string (which may carry tokens) is dropped entirely.
            if "@" in netloc:
                netloc = netloc.rpartition("@")[2]
            return f"{parsed.scheme}://{netloc}{parsed.path}"
        except (ValueError, TypeError):
            return "<url>"

    @staticmethod
    def sanitize_token(token: str) -> str:
        """Sanitize tokens and secrets for logging."""
        if not token:
            return "<token>"
        if len(token) < 10:
            half_len = len(token) // 2
            return f"{token[:half_len]}..."
        return f"{token[:4]}...{token[-4:]}"

    @staticmethod
    def sanitize_location(location: Union[str, Path]) -> str:
        """Sanitize document locations (file paths, URLs, database references) for logging."""
        if not location:
            return "<empty>"

        location_str = str(location)

        # Handle URLs
        if location_str.startswith(("http://", "https://", "ftp://", "ftps://")):
            return DataSanitizer.sanitize_url(location_str)

        # Handle database-style references (table:id, collection/document, etc.)
        if ":" in location_str and not location_str.startswith("/"):
            parts = location_str.split(":", 1)
            if len(parts) == 2:
                table_name, record_id = parts
                return f"{table_name}:{DataSanitizer.sanitize_id(record_id)}"

        return DataSanitizer.sanitize_path(location_str)

    @staticmethod
    def sanitize_document_id(document_id: str) -> str:
        """Sanitize document IDs for logging (alias for sanitize_id for clarity)."""
        return DataSanitizer.sanitize_id(document_id)

    @staticmethod
    def sanitize_dict(data: Dict[str, Any], sensitive_keys: Optional[set] = None) -> Dict[str, Any]:
        """Sanitize dictionary data for logging.

        Keys containing any sensitive substring are token-masked, nested
        dicts are sanitized recursively, path/file/location-like string
        values are shortened, and long ID-like string values are truncated.
        All other values pass through unchanged.
        """
        if sensitive_keys is None:
            sensitive_keys = {
                "password",
                "token",
                "secret",
                "key",
                "api_key",
                "access_token",
                "refresh_token",
                "client_secret",
                "private_key",
                "credentials",
            }

        sanitized = {}
        for k, v in data.items():
            key_lower = k.lower()
            if any(sensitive_key in key_lower for sensitive_key in sensitive_keys):
                sanitized[k] = DataSanitizer.sanitize_token(str(v))
            elif isinstance(v, dict):
                # Recurse with the same sensitive-key set.
                sanitized[k] = DataSanitizer.sanitize_dict(v, sensitive_keys)
            elif isinstance(v, (str, Path)) and (
                "path" in key_lower
                or "file" in key_lower
                or "location" in key_lower
                or "document_location" in key_lower
            ):
                sanitized[k] = DataSanitizer.sanitize_location(v)
            elif isinstance(v, str) and (
                ("id" in key_lower and len(str(v)) > 8)
                or ("document_id" in key_lower and len(str(v)) > 8)
            ):
                sanitized[k] = DataSanitizer.sanitize_document_id(v)
            else:
                sanitized[k] = v
        return sanitized
@@ -0,0 +1,140 @@
1
+ from dataclasses import fields
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING, Optional
4
+
5
+ from unstructured_ingest.error import ProviderError, QuotaError, UserAuthError, UserError
6
+ from unstructured_ingest.logger import logger
7
+
8
+ if TYPE_CHECKING:
9
+ from unstructured_client.models.operations import PartitionRequest
10
+
11
+
12
def create_partition_request(filename: Path, parameters_dict: dict) -> "PartitionRequest":
    """Given a filename and a dict of API parameters, return a PartitionRequest for use
    by unstructured-client. Remove any params that aren't recognized by the SDK.

    Args:
        filename: Path to the file being partitioned
        parameters_dict: A mapping of all API params we want to send

    Returns: A PartitionRequest containing the file and all valid params
    """
    from unstructured_client.models.operations import PartitionRequest
    from unstructured_client.models.shared import Files, PartitionParameters

    # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
    # Prior to this it was a dataclass which doesn't have .__fields
    try:
        possible_fields = PartitionParameters.model_fields
    except AttributeError:
        possible_fields = [f.name for f in fields(PartitionParameters)]

    request_kwargs = {key: value for key, value in parameters_dict.items() if key in possible_fields}
    if len(request_kwargs) != len(parameters_dict):
        dropped = ", ".join(name for name in parameters_dict if name not in request_kwargs)
        logger.debug(
            "Following fields were omitted due to not being "
            "supported by the currently used unstructured client: {}".format(dropped)
        )

    logger.debug(f"using hosted partitioner with kwargs: {parameters_dict}")

    # Read the file contents eagerly so the handle is closed before the request is built.
    with open(filename, "rb") as source_file:
        request_kwargs["files"] = Files(
            content=source_file.read(),
            file_name=str(filename.resolve()),
        )

    return PartitionRequest(partition_parameters=PartitionParameters(**request_kwargs))
+
54
+
55
def wrap_error(e: Exception) -> Exception:
    """Map an unstructured-client exception onto the ingest error hierarchy.

    Known SDK error types (and 4xx/5xx status codes) are converted to the
    corresponding ingest error and *returned*; anything unrecognized is
    logged and re-raised as-is.
    """
    from unstructured_client.models.errors.httpvalidationerror import HTTPValidationError
    from unstructured_client.models.errors.sdkerror import SDKError
    from unstructured_client.models.errors.servererror import ServerError

    if isinstance(e, HTTPValidationError):
        return UserError(e.data.detail)
    if isinstance(e, ServerError):
        return ProviderError(e.data.detail)

    if not isinstance(e, SDKError):
        logger.error(f"Uncaught Error calling API: {e}")
        raise e

    status_code, body = e.status_code, e.body
    if status_code == 402:
        return QuotaError(body)
    if status_code in (401, 403):
        return UserAuthError(body)
    if 400 <= status_code < 500:
        return UserError(body)
    if status_code >= 500:
        return ProviderError(body)
    # SDKError with a status code we don't classify: surface it unchanged.
    logger.error(f"Uncaught Error calling API: {e}")
    raise e
+
81
+
82
async def call_api_async(
    server_url: Optional[str],
    api_key: Optional[str],
    filename: Path,
    api_parameters: dict,
    timeout_ms: Optional[int] = None,
) -> list[dict]:
    """Call the Unstructured API using unstructured-client.

    Args:
        server_url: The base URL where the API is hosted
        api_key: The user's API key (can be empty if this is a self hosted API)
        filename: Path to the file being partitioned
        api_parameters: A dict containing the requested API parameters
        timeout_ms: Optional request timeout in milliseconds

    Returns: A list of the file's elements, or an empty list if there was an error
    """
    from unstructured_client import UnstructuredClient

    client = UnstructuredClient(server_url=server_url, api_key_auth=api_key)
    request = create_partition_request(filename=filename, parameters_dict=api_parameters)
    try:
        response = await client.general.partition_async(request=request, timeout_ms=timeout_ms)
    except Exception as e:
        # Normalize SDK exceptions into the ingest error hierarchy.
        raise wrap_error(e)

    return response.elements or []
+
110
+
111
def call_api(
    server_url: Optional[str],
    api_key: Optional[str],
    filename: Path,
    api_parameters: dict,
    timeout_ms: Optional[int] = None,
) -> list[dict]:
    """Call the Unstructured API using unstructured-client.

    Args:
        server_url: The base URL where the API is hosted
        api_key: The user's API key (can be empty if this is a self hosted API)
        filename: Path to the file being partitioned
        api_parameters: A dict containing the requested API parameters
        timeout_ms: Optional request timeout in milliseconds

    Returns: A list of the file's elements, or an empty list if there was an error
    """
    from unstructured_client import UnstructuredClient

    client = UnstructuredClient(server_url=server_url, api_key_auth=api_key)
    request = create_partition_request(filename=filename, parameters_dict=api_parameters)
    try:
        response = client.general.partition(request=request, timeout_ms=timeout_ms)
    except Exception as e:
        # Normalize SDK exceptions into the ingest error hierarchy.
        raise wrap_error(e)

    return response.elements or []
@@ -0,0 +1,5 @@
1
+ """Utility functions for unstructured-ingest."""
2
+
3
+ from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
4
+
5
+ __all__ = ["mkdir_concurrent_safe"]
@@ -0,0 +1,56 @@
1
+ import base64
2
+ import hashlib
3
+ import json
4
+ import zlib
5
+ from itertools import groupby
6
+
7
+
8
def id_to_hash(element: dict, sequence_number: int) -> str:
    """Calculates and assigns a deterministic hash as an ID.

    The hash ID is based on element's text, sequence number on page,
    page number and its filename, so re-partitioning the same document
    yields stable IDs. ``element["element_id"]`` is overwritten in place.

    Args:
        element: the element dict to assign an ID to (mutated in place)
        sequence_number: index on page

    Returns: new ID value
    """
    filename = element["metadata"].get("filename")
    text = element["text"]
    page_number = element["metadata"].get("page_number")
    # Fix: the hash input previously hard-coded a literal instead of
    # interpolating `filename` (which was computed but unused), so elements
    # with identical text/page/sequence in different files collided.
    data = f"{filename}{text}{page_number}{sequence_number}"
    element["element_id"] = hashlib.sha256(data.encode()).hexdigest()[:32]
    return element["element_id"]
+
26
+
27
def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
    """Replace element IDs with deterministic hashes and remap parent IDs.

    Elements are numbered per page (the counter restarts whenever the page
    number changes), hashed via ``id_to_hash``, and any ``parent_id``
    metadata is rewritten to point at the new hashed IDs.
    """
    elements = elements.copy()

    # -- generate sequence number for each element on a page --
    pages = [element["metadata"].get("page_number") for element in elements]
    sequence_numbers = [
        index for _, members in groupby(pages) for index, _ in enumerate(members)
    ]

    # -- assign hash IDs, remembering how each old ID maps to its new one --
    old_to_new_mapping = {}
    for element, sequence_number in zip(elements, sequence_numbers):
        # Capture the old ID before id_to_hash overwrites it in place.
        old_id = element["element_id"]
        old_to_new_mapping[old_id] = id_to_hash(element=element, sequence_number=sequence_number)

    # -- map old parent IDs to new ones --
    for element in elements:
        parent_id = element["metadata"].get("parent_id")
        if parent_id:
            element["metadata"]["parent_id"] = old_to_new_mapping[parent_id]

    return elements
+
50
+
51
def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
    """Decode a base64 string of zlib-compressed JSON into element dicts.

    Reverses the encode pipeline: base64 text -> compressed bytes ->
    UTF-8 JSON -> list of dicts.
    """
    compressed_bytes = base64.b64decode(raw_s)
    json_text = zlib.decompress(compressed_bytes).decode("utf-8")
    return json.loads(json_text)