unstructured_ingest-1.2.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/processes/connectors/milvus.py (new file)
@@ -0,0 +1,311 @@
+ import json
+ from contextlib import contextmanager
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from typing import TYPE_CHECKING, Any, Generator, Optional
+
+ from dateutil import parser
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.data_types.file_data import FileData
+ from unstructured_ingest.error import (
+     DestinationConnectionError,
+     KeyError,
+     WriteError,
+ )
+ from unstructured_ingest.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Uploader,
+     UploaderConfig,
+     UploadStager,
+     UploadStagerConfig,
+ )
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.processes.connector_registry import (
+     DestinationRegistryEntry,
+ )
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
+ from unstructured_ingest.utils.data_prep import flatten_dict
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from pymilvus import MilvusClient
+
+ CONNECTOR_TYPE = "milvus"
+
+
+ class MilvusAccessConfig(AccessConfig):
+     password: Optional[str] = Field(default=None, description="Milvus password")
+     token: Optional[str] = Field(default=None, description="Milvus access token")
+
+
+ class MilvusConnectionConfig(ConnectionConfig):
+     access_config: Secret[MilvusAccessConfig] = Field(
+         default=MilvusAccessConfig(), validate_default=True
+     )
+     uri: Optional[str] = Field(
+         default=None, description="Milvus uri", examples=["http://localhost:19530"]
+     )
+     user: Optional[str] = Field(default=None, description="Milvus user")
+     db_name: Optional[str] = Field(default=None, description="Milvus database name")
+
+     def get_connection_kwargs(self) -> dict[str, Any]:
+         access_config = self.access_config.get_secret_value()
+         access_config_dict = access_config.model_dump()
+         connection_config_dict = self.model_dump()
+         connection_config_dict.pop("access_config", None)
+         connection_config_dict.update(access_config_dict)
+         # Drop any that were not set explicitly
+         connection_config_dict = {k: v for k, v in connection_config_dict.items() if v is not None}
+         return connection_config_dict
+
+     @requires_dependencies(["pymilvus"], extras="milvus")
+     @contextmanager
+     def get_client(self) -> Generator["MilvusClient", None, None]:
+         from pymilvus import MilvusClient
+
+         client = None
+         try:
+             client = MilvusClient(**self.get_connection_kwargs())
+             yield client
+         finally:
+             if client:
+                 client.close()
+
+
+ class MilvusUploadStagerConfig(UploadStagerConfig):
+     fields_to_include: Optional[list[str]] = None
+     """If set - list of fields to include in the output.
+     Unspecified fields are removed from the elements.
+     This action takes place after metadata flattening.
+     Missing fields will cause the stager to raise KeyError."""
+
+     flatten_metadata: bool = True
+     """If set - flatten the "metadata" key and put its contents directly into the data"""
+
+
+ @dataclass
+ class MilvusUploadStager(UploadStager):
+     upload_stager_config: MilvusUploadStagerConfig = field(
+         default_factory=lambda: MilvusUploadStagerConfig()
+     )
+
+     @staticmethod
+     def parse_date_string(date_string: str) -> float:
+         try:
+             timestamp = float(date_string)
+             return timestamp
+         except ValueError:
+             pass
+
+         try:
+             dt = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
+             return dt.timestamp()
+         except ValueError:
+             pass
+
+         return parser.parse(date_string).timestamp()
+
+     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+         working_data = element_dict.copy()
+
+         if self.upload_stager_config.flatten_metadata:
+             metadata: dict[str, Any] = working_data.pop("metadata", {})
+             flattened_metadata = flatten_dict(
+                 metadata,
+                 separator="_",
+                 flatten_lists=False,
+                 remove_none=True,
+             )
+             working_data.update(flattened_metadata)
+
+         # TODO: milvus sdk doesn't seem to support defaults via the schema yet,
+         # remove once that gets updated
+         defaults = {"is_continuation": False}
+         for default in defaults:
+             if default not in working_data:
+                 working_data[default] = defaults[default]
+
+         if self.upload_stager_config.fields_to_include:
+             data_keys = set(working_data.keys())
+             for data_key in data_keys:
+                 if data_key not in self.upload_stager_config.fields_to_include:
+                     working_data.pop(data_key)
+             for field_include_key in self.upload_stager_config.fields_to_include:
+                 if field_include_key not in working_data:
+                     raise KeyError(f"Field '{field_include_key}' is missing in data!")
+
+         datetime_columns = [
+             "data_source_date_created",
+             "data_source_date_modified",
+             "data_source_date_processed",
+             "last_modified",
+         ]
+
+         json_dumps_fields = ["languages", "data_source_permissions_data"]
+
+         for datetime_column in datetime_columns:
+             if datetime_column in working_data:
+                 working_data[datetime_column] = self.parse_date_string(
+                     working_data[datetime_column]
+                 )
+         for json_dumps_field in json_dumps_fields:
+             if json_dumps_field in working_data:
+                 working_data[json_dumps_field] = json.dumps(working_data[json_dumps_field])
+         working_data[RECORD_ID_LABEL] = file_data.identifier
+         return working_data
+
+
+ class MilvusUploaderConfig(UploaderConfig):
+     db_name: Optional[str] = Field(default=None, description="Milvus database name")
+     collection_name: str = Field(description="Milvus collection to write to")
+     record_id_key: str = Field(
+         default=RECORD_ID_LABEL,
+         description="searchable key to find entries for the same record on previous runs",
+     )
+
+
+ @dataclass
+ class MilvusUploader(Uploader):
+     connection_config: MilvusConnectionConfig
+     upload_config: MilvusUploaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def has_dynamic_fields_enabled(self) -> bool:
+         """Check if the target collection has dynamic fields enabled."""
+         try:
+             with self.get_client() as client:
+                 collection_info = client.describe_collection(self.upload_config.collection_name)
+
+                 # Check if dynamic field is enabled
+                 # The schema info should contain enable_dynamic_field or enableDynamicField
+                 schema_info = collection_info.get(
+                     "enable_dynamic_field",
+                     collection_info.get("enableDynamicField", False),
+                 )
+                 return bool(schema_info)
+         except Exception as e:
+             logger.warning(f"Could not determine if collection has dynamic fields enabled: {e}")
+             return False
+
+     @DestinationConnectionError.wrap
+     def precheck(self):
+         from pymilvus import MilvusException
+
+         try:
+             with self.get_client() as client:
+                 if not client.has_collection(self.upload_config.collection_name):
+                     raise DestinationConnectionError(
+                         f"Collection '{self.upload_config.collection_name}' does not exist"
+                     )
+
+         except MilvusException as milvus_exception:
+             raise DestinationConnectionError(
+                 f"failed to precheck Milvus: {str(milvus_exception.message)}"
+             ) from milvus_exception
+
+     @contextmanager
+     def get_client(self) -> Generator["MilvusClient", None, None]:
+         with self.connection_config.get_client() as client:
+             if db_name := self.upload_config.db_name:
+                 client.using_database(db_name=db_name)
+             yield client
+
+     def delete_by_record_id(self, file_data: FileData) -> None:
+         logger.info(
+             f"deleting any content with metadata {self.upload_config.record_id_key}="
+             f"{file_data.identifier} from milvus collection {self.upload_config.collection_name}"
+         )
+         with self.get_client() as client:
+             delete_filter = f'{self.upload_config.record_id_key} == "{file_data.identifier}"'
+             resp = client.delete(
+                 collection_name=self.upload_config.collection_name, filter=delete_filter
+             )
+             logger.info(
+                 "deleted {} records from milvus collection {}".format(
+                     resp["delete_count"], self.upload_config.collection_name
+                 )
+             )
+
+     @requires_dependencies(["pymilvus"], extras="milvus")
+     def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
+         """
+         Conforms the provided data to the schema of the target Milvus collection.
+         - If dynamic fields are enabled, it ensures JSON-stringified fields are decoded.
+         - If dynamic fields are disabled, it filters out any fields not present in the schema.
+         """
+
+         dynamic_fields_enabled = self.has_dynamic_fields_enabled()
+
+         # If dynamic fields are enabled, the 'languages' field needs to be a list
+         if dynamic_fields_enabled:
+             logger.debug("Dynamic fields enabled, ensuring 'languages' field is a list.")
+             prepared_data = []
+             for item in data:
+                 new_item = item.copy()
+                 if "languages" in new_item and isinstance(new_item["languages"], str):
+                     try:
+                         new_item["languages"] = json.loads(new_item["languages"])
+                     except (json.JSONDecodeError, TypeError):
+                         logger.warning(
+                             f"Could not JSON decode languages field: {new_item['languages']}. "
+                             "Leaving as string.",
+                         )
+                 prepared_data.append(new_item)
+             return prepared_data
+
+         # If dynamic fields are not enabled, we need to filter out the metadata fields
+         # to avoid insertion errors for fields not defined in the schema
+         with self.get_client() as client:
+             collection_info = client.describe_collection(
+                 self.upload_config.collection_name,
+             )
+             schema_fields = {
+                 field["name"]
+                 for field in collection_info.get("fields", [])
+                 if not field.get("auto_id", False)
+             }
+         # Remove metadata fields that are not part of the base schema
+         filtered_data = []
+         for item in data:
+             filtered_item = {key: value for key, value in item.items() if key in schema_fields}
+             filtered_data.append(filtered_item)
+         return filtered_data
+
+     @requires_dependencies(["pymilvus"], extras="milvus")
+     def insert_results(self, data: list[dict]):
+         from pymilvus import MilvusException
+
+         logger.info(
+             f"uploading {len(data)} entries to collection {self.upload_config.collection_name} "
+             f"in db {self.upload_config.db_name or self.connection_config.db_name}"
+         )
+
+         prepared_data = self._prepare_data_for_insert(data=data)
+
+         with self.get_client() as client:
+             try:
+                 res = client.insert(
+                     collection_name=self.upload_config.collection_name, data=prepared_data
+                 )
+             except MilvusException as milvus_exception:
+                 raise WriteError(
+                     f"failed to upload records to Milvus: {str(milvus_exception.message)}"
+                 ) from milvus_exception
+             if "err_count" in res and isinstance(res["err_count"], int) and res["err_count"] > 0:
+                 err_count = res["err_count"]
+                 raise WriteError(f"failed to upload {err_count} docs")
+
+     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+         self.delete_by_record_id(file_data=file_data)
+         self.insert_results(data=data)
+
+
+ milvus_destination_entry = DestinationRegistryEntry(
+     connection_config=MilvusConnectionConfig,
+     uploader=MilvusUploader,
+     uploader_config=MilvusUploaderConfig,
+     upload_stager=MilvusUploadStager,
+     upload_stager_config=MilvusUploadStagerConfig,
+ )
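
For orientation, the sketch below wires the Milvus destination above together by hand rather than through the pipeline. None of it comes from the package itself: the URI, token, the "elements" collection, and the way FileData is constructed here (identifier, connector_type, source_identifiers) are illustrative assumptions; in normal use the pipeline builds these objects from the registry entry.

# Hypothetical manual wiring of the Milvus destination above (all values assumed).
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers

connection_config = MilvusConnectionConfig(
    access_config=MilvusAccessConfig(token="my-token"),  # assumed credentials
    uri="http://localhost:19530",                        # assumed local Milvus
)
uploader = MilvusUploader(
    connection_config=connection_config,
    upload_config=MilvusUploaderConfig(collection_name="elements"),  # assumed collection
)
stager = MilvusUploadStager()

file_data = FileData(
    identifier="doc-1",  # becomes the record id used for delete-then-insert
    connector_type="local",
    source_identifiers=SourceIdentifiers(filename="doc.pdf", fullpath="doc.pdf"),
)
elements = [{"text": "hello world", "metadata": {"languages": ["eng"]}}]

# conform_dict flattens metadata, fills defaults, JSON-stringifies list fields
# such as "languages", and stamps the record id label onto every row
rows = [stager.conform_dict(element_dict=e, file_data=file_data) for e in elements]

uploader.precheck()                                # fails fast if the collection is missing
uploader.run_data(data=rows, file_data=file_data)  # delete old rows for this record, then insert

Note the idempotency scheme this implies: run_data first deletes everything matching record_id_key == file_data.identifier, so re-processing the same source document replaces its rows instead of duplicating them.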
unstructured_ingest/processes/connectors/mongodb.py (new file)
@@ -0,0 +1,389 @@
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from datetime import datetime
+ from time import time
+ from typing import TYPE_CHECKING, Any, Generator, Optional
+
+ from pydantic import BaseModel, Field, Secret
+
+ from unstructured_ingest.__version__ import __version__ as unstructured_version
+ from unstructured_ingest.data_types.file_data import (
+     BatchFileData,
+     BatchItem,
+     FileData,
+     FileDataSourceMetadata,
+     SourceIdentifiers,
+ )
+ from unstructured_ingest.error import (
+     ConnectionError,
+     DestinationConnectionError,
+     SourceConnectionError,
+     ValueError,
+ )
+ from unstructured_ingest.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     DownloadResponse,
+     Indexer,
+     IndexerConfig,
+     Uploader,
+     UploaderConfig,
+     download_responses,
+ )
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from pymongo import MongoClient
+     from pymongo.collection import Collection
+
+ CONNECTOR_TYPE = "mongodb"
+ SERVER_API_VERSION = "1"
+
+
+ class MongoDBAdditionalMetadata(BaseModel):
+     database: str
+     collection: str
+
+
+ class MongoDBBatchFileData(BatchFileData):
+     additional_metadata: MongoDBAdditionalMetadata
+
+
+ class MongoDBAccessConfig(AccessConfig):
+     uri: Optional[str] = Field(default=None, description="URI to use when connecting")
+
+
+ class MongoDBConnectionConfig(ConnectionConfig):
+     access_config: Secret[MongoDBAccessConfig] = Field(
+         default=MongoDBAccessConfig(), validate_default=True
+     )
+     host: Optional[str] = Field(
+         default=None,
+         description="hostname or IP address or Unix domain socket path of a single mongod or "
+         "mongos instance to connect to, or a list of hostnames",
+     )
+     port: int = Field(default=27017)
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+
+     @contextmanager
+     @requires_dependencies(["pymongo"], extras="mongodb")
+     def get_client(self) -> Generator["MongoClient", None, None]:
+         from pymongo import MongoClient
+         from pymongo.driver_info import DriverInfo
+         from pymongo.server_api import ServerApi
+
+         access_config = self.access_config.get_secret_value()
+         if uri := access_config.uri:
+             client_kwargs = {
+                 "host": uri,
+                 "server_api": ServerApi(version=SERVER_API_VERSION),
+                 "driver": DriverInfo(name="unstructured", version=unstructured_version),
+             }
+         else:
+             client_kwargs = {
+                 "host": self.host,
+                 "port": self.port,
+                 "server_api": ServerApi(version=SERVER_API_VERSION),
+             }
+         with MongoClient(**client_kwargs) as client:
+             # UnsupportedDigestmodError means that SCRAM-SHA-1 is disabled
+             # It uses md5 which is unavailable on FIPS images
+             try:
+                 from hashlib import UnsupportedDigestmodError  # type: ignore[attr-defined]
+             except ImportError:
+                 from _hashlib import UnsupportedDigestmodError  # type: ignore[attr-defined]
+
+             # Check if the authentication mechanism is supported
+             try:
+                 client.admin.command("ping")
+             except UnsupportedDigestmodError as e:
+                 raise ConnectionError(
+                     "Authentication using SCRAM-SHA-1 is disabled. "
+                     "Use SCRAM-SHA-256 instead. "
+                     "See: https://www.mongodb.com/docs/manual/core/security-scram/"
+                 ) from e
+             yield client
+
+
+ class MongoDBIndexerConfig(IndexerConfig):
+     batch_size: int = Field(default=100, description="Number of records per batch")
+     database: Optional[str] = Field(default=None, description="database name to connect to")
+     collection: Optional[str] = Field(default=None, description="collection name to connect to")
+
+
+ class MongoDBDownloaderConfig(DownloaderConfig):
+     pass
+
+
+ @dataclass
+ class MongoDBIndexer(Indexer):
+     connection_config: MongoDBConnectionConfig
+     index_config: MongoDBIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def precheck(self) -> None:
+         """Validates the connection to the MongoDB server."""
+         try:
+             with self.connection_config.get_client() as client:
+                 client.admin.command("ping")
+                 database_names = client.list_database_names()
+                 database_name = self.index_config.database
+                 if database_name not in database_names:
+                     raise SourceConnectionError(
+                         "database {} does not exist: {}".format(
+                             database_name, ", ".join(database_names)
+                         )
+                     )
+                 database = client[database_name]
+                 collection_names = database.list_collection_names()
+                 collection_name = self.index_config.collection
+                 if collection_name not in collection_names:
+                     raise SourceConnectionError(
+                         "collection {} does not exist: {}".format(
+                             collection_name, ", ".join(collection_names)
+                         )
+                     )
+         except Exception as e:
+             logger.error(f"Failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"Failed to validate connection: {e}")
+
+     def run(self, **kwargs: Any) -> Generator[BatchFileData, None, None]:
+         """Generates batched FileData objects for the documents in the MongoDB collection."""
+         with self.connection_config.get_client() as client:
+             database = client[self.index_config.database]
+             collection = database[self.index_config.collection]
+
+             # Get list of document IDs
+             ids = collection.distinct("_id")
+
+             ids = sorted(ids)
+             batch_size = self.index_config.batch_size
+
+             for id_batch in batch_generator(ids, batch_size=batch_size):
+                 # Human-readable display name for this batch
+                 display_name = (
+                     f"{self.index_config.database}.{self.index_config.collection}, "
+                     f"batch {id_batch[0]}-{id_batch[-1]}"
+                 )
+                 metadata = FileDataSourceMetadata(
+                     date_processed=str(time()),
+                     record_locator={
+                         "database": self.index_config.database,
+                         "collection": self.index_config.collection,
+                     },
+                 )
+
+                 file_data = MongoDBBatchFileData(
+                     connector_type=self.connector_type,
+                     metadata=metadata,
+                     batch_items=[BatchItem(identifier=str(doc_id)) for doc_id in id_batch],
+                     additional_metadata=MongoDBAdditionalMetadata(
+                         collection=self.index_config.collection, database=self.index_config.database
+                     ),
+                     display_name=display_name,
+                 )
+                 yield file_data
+
+
+ @dataclass
+ class MongoDBDownloader(Downloader):
+     download_config: MongoDBDownloaderConfig
+     connection_config: MongoDBConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def generate_download_response(
+         self, doc: dict, file_data: MongoDBBatchFileData
+     ) -> DownloadResponse:
+         from bson.objectid import ObjectId
+
+         doc_id = doc["_id"]
+         doc.pop("_id", None)
+
+         # Extract date_created from the document or ObjectId
+         date_created = None
+         if "date_created" in doc:
+             # If the document has a 'date_created' field, use it
+             date_created = doc["date_created"]
+             if isinstance(date_created, datetime):
+                 date_created = date_created.isoformat()
+             else:
+                 # Convert to ISO format if it's a string
+                 date_created = str(date_created)
+         elif isinstance(doc_id, ObjectId):
+             # Use the ObjectId's generation time
+             date_created = doc_id.generation_time.isoformat()
+
+         flattened_dict = flatten_dict(dictionary=doc)
+         concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
+
+         # Create a FileData object for each document with source_identifiers
+         filename = f"{doc_id}.txt"
+         file_data.source_identifiers = SourceIdentifiers(
+             filename=filename,
+             fullpath=filename,
+         )
+         cast_file_data = FileData.cast(file_data=file_data)
+         cast_file_data.identifier = str(doc_id)
+
+         # Determine the download path
+         download_path = self.get_download_path(file_data=cast_file_data)
+         if download_path is None:
+             raise ValueError("Download path could not be determined")
+
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Write the concatenated values to the file
+         with open(download_path, "w", encoding="utf8") as f:
+             f.write(concatenated_values)
+
+         # Update metadata
+         cast_file_data.metadata.record_locator["document_id"] = str(doc_id)
+         cast_file_data.metadata.date_created = date_created
+
+         return super().generate_download_response(
+             file_data=cast_file_data, download_path=download_path
+         )
+
+     @SourceConnectionError.wrap
+     @requires_dependencies(["bson"], extras="mongodb")
+     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         """Fetches the documents from MongoDB and writes each to a file."""
+         from bson.errors import InvalidId
+         from bson.objectid import ObjectId
+
+         mongo_file_data = MongoDBBatchFileData.cast(file_data=file_data)
+
+         with self.connection_config.get_client() as client:
+             database = client[mongo_file_data.additional_metadata.database]
+             collection = database[mongo_file_data.additional_metadata.collection]
+
+             ids = [item.identifier for item in mongo_file_data.batch_items]
+
+             object_ids = []
+             for doc_id in ids:
+                 try:
+                     object_ids.append(ObjectId(doc_id))
+                 except InvalidId as e:
+                     error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
+                     logger.error(error_message)
+                     raise ValueError(error_message) from e
+
+             try:
+                 docs = list(collection.find({"_id": {"$in": object_ids}}))
+             except Exception as e:
+                 logger.error(f"Failed to fetch documents: {e}", exc_info=True)
+                 raise e
+
+         download_responses = []
+         for doc in docs:
+             download_responses.append(
+                 self.generate_download_response(doc=doc, file_data=mongo_file_data)
+             )
+
+         return download_responses
+
+
+ class MongoDBUploaderConfig(UploaderConfig):
+     batch_size: int = Field(default=100, description="Number of records per batch")
+     database: Optional[str] = Field(default=None, description="database name to connect to")
+     collection: Optional[str] = Field(default=None, description="collection name to connect to")
+     record_id_key: str = Field(
+         default=RECORD_ID_LABEL,
+         description="searchable key to find entries for the same record on previous runs",
+     )
+
+
+ @dataclass
+ class MongoDBUploader(Uploader):
+     upload_config: MongoDBUploaderConfig
+     connection_config: MongoDBConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def precheck(self) -> None:
+         try:
+             with self.connection_config.get_client() as client:
+                 client.admin.command("ping")
+                 database_names = client.list_database_names()
+                 database_name = self.upload_config.database
+                 if database_name not in database_names:
+                     raise DestinationConnectionError(
+                         "database {} does not exist: {}".format(
+                             database_name, ", ".join(database_names)
+                         )
+                     )
+                 database = client[database_name]
+                 collection_names = database.list_collection_names()
+                 collection_name = self.upload_config.collection
+                 if collection_name not in collection_names:
+                     raise DestinationConnectionError(
+                         "collection {} does not exist: {}".format(
+                             collection_name, ", ".join(collection_names)
+                         )
+                     )
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+     def can_delete(self, collection: "Collection") -> bool:
+         indexed_keys = []
+         for index in collection.list_indexes():
+             key_bson = index["key"]
+             indexed_keys.extend(key_bson.keys())
+         return self.upload_config.record_id_key in indexed_keys
+
+     def delete_by_record_id(self, collection: "Collection", file_data: FileData) -> None:
+         logger.debug(
+             f"deleting any content with metadata "
+             f"{self.upload_config.record_id_key}={file_data.identifier} "
+             f"from collection: {collection.name}"
+         )
+         query = {self.upload_config.record_id_key: file_data.identifier}
+         delete_results = collection.delete_many(filter=query)
+         logger.info(
+             f"deleted {delete_results.deleted_count} records from collection {collection.name}"
+         )
+
+     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+         logger.info(
+             f"writing {len(data)} objects to destination "
+             f"db {self.upload_config.database}, "
+             f"collection {self.upload_config.collection} "
+             f"at {self.connection_config.host}",
+         )
+         # This would typically live in the stager, but since no other manipulation
+         # is done, the record id field is set here in the uploader
+         for element in data:
+             element[self.upload_config.record_id_key] = file_data.identifier
+         with self.connection_config.get_client() as client:
+             db = client[self.upload_config.database]
+             collection = db[self.upload_config.collection]
+             if self.can_delete(collection=collection):
+                 self.delete_by_record_id(file_data=file_data, collection=collection)
+             else:
+                 logger.warning("criteria for deleting previous content not met, skipping")
+             for chunk in batch_generator(data, self.upload_config.batch_size):
+                 collection.insert_many(chunk)
+
+
+ mongodb_destination_entry = DestinationRegistryEntry(
+     connection_config=MongoDBConnectionConfig,
+     uploader=MongoDBUploader,
+     uploader_config=MongoDBUploaderConfig,
+ )
+
+ mongodb_source_entry = SourceRegistryEntry(
+     connection_config=MongoDBConnectionConfig,
+     indexer_config=MongoDBIndexerConfig,
+     indexer=MongoDBIndexer,
+     downloader_config=MongoDBDownloaderConfig,
+     downloader=MongoDBDownloader,
+ )
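
As with Milvus, here is a hedged sketch of driving the MongoDB source manually rather than through the pipeline. The connection URI, database and collection names, and the download_dir option (assumed here to be the base DownloaderConfig field that feeds get_download_path) are illustrative assumptions, not values from the package.

# Hypothetical manual run of the MongoDB source above (all values assumed).
connection_config = MongoDBConnectionConfig(
    access_config=MongoDBAccessConfig(uri="mongodb+srv://user:pass@cluster.example.net"),
)
indexer = MongoDBIndexer(
    connection_config=connection_config,
    index_config=MongoDBIndexerConfig(database="mydb", collection="docs", batch_size=50),
)
downloader = MongoDBDownloader(
    connection_config=connection_config,
    # download_dir assumed: controls where the per-document .txt files land
    download_config=MongoDBDownloaderConfig(download_dir="/tmp/mongo_downloads"),
)

indexer.precheck()            # ping the server, verify the database and collection exist
for batch in indexer.run():   # one MongoDBBatchFileData per batch of sorted _ids
    # each response points at a .txt file holding the flattened document values
    responses = downloader.run(file_data=batch)

On the destination side, note that the registry entry declares no stager, which matches the comment in run_data: the record id key is stamped onto each element in the uploader itself, and previous content is only deleted when that key is indexed on the target collection (the can_delete check).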