unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,534 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import uuid
5
+ from collections import defaultdict
6
+ from contextlib import asynccontextmanager
7
+ from dataclasses import dataclass
8
+ from enum import Enum
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Literal, Optional
11
+
12
+ from pydantic import BaseModel, ConfigDict, Field, Secret, ValidationError, field_validator
13
+
14
+ from unstructured_ingest.data_types.entities import EntitiesData, Entity, EntityRelationship
15
+ from unstructured_ingest.data_types.file_data import FileData
16
+ from unstructured_ingest.error import (
17
+ DestinationConnectionError,
18
+ UnstructuredIngestError,
19
+ ValueError,
20
+ )
21
+ from unstructured_ingest.interfaces import (
22
+ AccessConfig,
23
+ ConnectionConfig,
24
+ Uploader,
25
+ UploaderConfig,
26
+ UploadStager,
27
+ UploadStagerConfig,
28
+ )
29
+ from unstructured_ingest.logger import logger
30
+ from unstructured_ingest.processes.connector_registry import (
31
+ DestinationRegistryEntry,
32
+ )
33
+ from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
34
+ from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
35
+ from unstructured_ingest.utils.dep_check import requires_dependencies
36
+
37
+ SimilarityFunction = Literal["cosine"]
38
+
39
+ if TYPE_CHECKING:
40
+ from neo4j import AsyncDriver, Auth
41
+ from networkx import Graph, MultiDiGraph
42
+
43
+ CONNECTOR_TYPE = "neo4j"
44
+
45
+
46
class Neo4jAccessConfig(AccessConfig):
    # Sensitive part of the Neo4j configuration; `Neo4jConnectionConfig`
    # stores an instance of this model wrapped in `Secret`.

    # Password handed to the driver's basic authentication.
    password: str
48
+
49
+
50
class Neo4jConnectionConfig(ConnectionConfig):
    # Connection settings for a Neo4j database, plus helpers that turn them
    # into an async driver.

    access_config: Secret[Neo4jAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    username: str = Field(default="neo4j")
    uri: str = Field(description="Neo4j Connection URI <scheme>://<host>:<port>")
    database: str = Field(default="neo4j", description="Name of the target database")

    @requires_dependencies(["neo4j"], extras="neo4j")
    @asynccontextmanager
    async def get_client(self) -> AsyncGenerator["AsyncDriver", None]:
        """Yield an async Neo4j driver and close it when the context exits.

        The driver is closed in a ``finally`` clause, so it is released even
        when the body of the ``async with`` block raises.
        """
        from neo4j import AsyncGraphDatabase

        driver_kwargs = self._get_driver_parameters()
        client = AsyncGraphDatabase.driver(**driver_kwargs)
        logger.info(f"Created driver connecting to the database '{self.database}' at {self.uri}.")
        try:
            yield client
        finally:
            await client.close()
            logger.info(
                f"Closed driver connecting to the database '{self.database}' at {self.uri}."
            )

    def _get_driver_parameters(self) -> dict:
        """Return the keyword arguments passed to ``AsyncGraphDatabase.driver``."""
        return dict(uri=self.uri, auth=self._get_auth(), database=self.database)

    @requires_dependencies(["neo4j"], extras="neo4j")
    def _get_auth(self) -> "Auth":
        """Build a basic-auth object from the username and the secret password."""
        from neo4j import Auth

        user = self.username
        credentials = self.access_config.get_secret_value()
        return Auth("basic", user, credentials.password)
84
+
85
+
86
class Neo4jUploadStagerConfig(UploadStagerConfig):
    # Declares no options of its own; everything comes from the base config.
    ...
88
+
89
+
90
@dataclass
class Neo4jUploadStager(UploadStager):
    """Convert a file of elements into a serialized graph for the Neo4j uploader.

    The graph holds one document node, one node per element (or chunk), the
    original elements of every chunk, and any entity nodes found in the
    element metadata.
    """

    # NOTE(review): this is a pydantic `Field` used as the default of a stdlib
    # dataclass attribute — confirm the base class gives it the intended meaning.
    upload_stager_config: Neo4jUploadStagerConfig = Field(
        default_factory=Neo4jUploadStagerConfig, validate_default=True
    )

    def run(  # type: ignore
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Build the graph for ``elements_filepath`` and write it as JSON.

        Returns the path ``<output_dir>/<output_filename>.json``; the parent
        directory is created when missing.
        """
        elements = get_json_data(elements_filepath)
        document_node = self._create_document_node(file_data=file_data)
        nx_graph = self._create_lexical_graph(elements, document_node)

        destination = Path(output_dir) / f"{output_filename}.json"
        destination.parent.mkdir(parents=True, exist_ok=True)
        with open(destination, "w") as handle:
            handle.write(_GraphData.from_nx(nx_graph).model_dump_json())
        return destination

    def _add_entities(self, entities: list[Entity], graph: "Graph", element_node: _Node) -> None:
        """Link ``element_node`` to each entity and each entity to its type node."""
        for item in entities:
            entity_node = _Node(
                labels=[Label.ENTITY], properties={"id": item.entity}, id_=item.entity
            )
            type_node = _Node(labels=[Label.ENTITY], properties={"id": item.type}, id_=item.type)
            graph.add_edge(entity_node, type_node, relationship=Relationship.ENTITY_TYPE)
            graph.add_edge(element_node, entity_node, relationship=Relationship.HAS_ENTITY)

    def _add_entity_relationships(
        self, relationships: list[EntityRelationship], graph: "Graph"
    ) -> None:
        """Add one edge per entity relationship, labeled with its own relationship value."""
        for link in relationships:
            source = _Node(labels=[Label.ENTITY], properties={"id": link.from_}, id_=link.from_)
            target = _Node(labels=[Label.ENTITY], properties={"id": link.to}, id_=link.to)
            graph.add_edge(source, target, relationship=link.relationship)

    def _add_entity_data(self, element: dict, graph: "Graph", element_node: _Node) -> None:
        """Add the entities stored under ``element["metadata"]["entities"]`` to the graph.

        A list is read as bare entities (non-dict items are ignored); a dict is
        read as entities plus relationships. Validation failures are logged as
        a warning and do not abort staging.
        """
        raw_entities = element.get("metadata", {}).get("entities", {})
        if not raw_entities:
            return None

        try:
            if isinstance(raw_entities, list):
                validated = [
                    Entity.model_validate(item) for item in raw_entities if isinstance(item, dict)
                ]
                self._add_entities(validated, graph, element_node)
            elif isinstance(raw_entities, dict):
                parsed = EntitiesData.model_validate(raw_entities)
                self._add_entities(parsed.items, graph, element_node)
                self._add_entity_relationships(parsed.relationships, graph)
        except ValidationError:
            logger.warning(
                "Failed to add entities to the graph. "
                "Please check the format of the entities in the input data."
            )
        return None

    def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> "Graph":
        """Assemble the multi-directed graph for one document.

        Every element is attached to the document, each element after the
        first gets an ordering edge pointing at its predecessor, and for
        chunks the original elements are attached to both chunk and document.
        """
        import networkx as nx

        graph = nx.MultiDiGraph()
        graph.add_node(document_node)

        last_seen: Optional[_Node] = None
        for element in elements:
            current = self._create_element_node(element)
            is_chunk = self._is_chunk(element)
            if is_chunk:
                order_relationship = Relationship.NEXT_CHUNK
            else:
                order_relationship = Relationship.NEXT_ELEMENT

            if last_seen:
                graph.add_edge(current, last_seen, relationship=order_relationship)
            last_seen = current

            graph.add_edge(current, document_node, relationship=Relationship.PART_OF_DOCUMENT)
            self._add_entity_data(element, graph, current)

            if not is_chunk:
                continue

            for origin in format_and_truncate_orig_elements(element, include_text=True):
                origin_node = self._create_element_node(origin)
                graph.add_edge(origin_node, current, relationship=Relationship.PART_OF_CHUNK)
                graph.add_edge(
                    origin_node, document_node, relationship=Relationship.PART_OF_DOCUMENT
                )
                self._add_entity_data(origin, graph, origin_node)

        return graph

    # TODO(Filip Knefel): Ensure _is_chunk is as reliable as possible, consider different checks
    def _is_chunk(self, element: dict) -> bool:
        """Return True when the element's metadata has an ``orig_elements`` key."""
        metadata = element.get("metadata", {})
        return "orig_elements" in metadata

    def _create_document_node(self, file_data: FileData) -> _Node:
        """Create the document node, copying the name and dates that are set."""
        props = {}
        identifiers = file_data.source_identifiers
        if identifiers:
            props["name"] = identifiers.filename
        for key in ("date_created", "date_modified"):
            value = getattr(file_data.metadata, key)
            if value:
                props[key] = value
        return _Node(id_=file_data.identifier, properties=props, labels=[Label.DOCUMENT])

    def _create_element_node(self, element: dict) -> _Node:
        """Create a chunk or element node keyed by ``element["element_id"]``."""
        element_id = element["element_id"]
        props = {"id": element_id}

        # Original elements of a chunk may carry no text; empty values are not copied.
        for key in ("text", "embeddings"):
            if value := element.get(key):
                props[key] = value

        label = Label.CHUNK if self._is_chunk(element) else Label.UNSTRUCTURED_ELEMENT
        return _Node(id_=element_id, properties=props, labels=[label])
228
+
229
+
230
class _GraphData(BaseModel):
    """Serializable form of the staged graph: its nodes and its labeled edges."""

    nodes: list[_Node]
    edges: list[_Edge]

    @classmethod
    def from_nx(cls, nx_graph: "MultiDiGraph") -> _GraphData:
        """Build graph data from a networkx graph.

        Each edge must carry a ``relationship`` attribute. A value that is a
        ``Relationship`` member, or equals the value of one, is stored as that
        member; any other value (for example a free-form entity relationship
        string) is stored unchanged.
        """
        # `value in Relationship` raises TypeError for non-member operands
        # before Python 3.12, so resolve through an explicit value lookup.
        members_by_value = {member.value: member for member in Relationship}

        def _normalize(raw: Any) -> Any:
            if isinstance(raw, Relationship):
                return raw
            return members_by_value.get(raw, raw)

        nodes = list(nx_graph.nodes())
        edges = [
            _Edge(
                source=u,
                destination=v,
                relationship=_normalize(data_dict["relationship"]),
            )
            for u, v, data_dict in nx_graph.edges(data=True)
        ]
        return cls(nodes=nodes, edges=edges)
248
+
249
+
250
class _Node(BaseModel):
    """A graph node: one or more labels, a property map and a string id.

    Instances hash on ``id_`` so they can be used as networkx graph nodes.
    """

    model_config = ConfigDict()

    labels: list[Label]
    properties: dict = Field(default_factory=dict)
    # A random UUID string is generated when no id is supplied.
    id_: str = Field(default_factory=lambda: str(uuid.uuid4()))

    def __hash__(self):
        return hash(self.id_)

    @property
    def main_label(self) -> Label:
        """Return the first label; nodes are grouped by it when they are merged."""
        return self.labels[0]

    # `field_validator` must be the outermost decorator: with `classmethod` on
    # top, pydantic does not register the validator and the check never runs.
    @field_validator("labels", mode="after")
    @classmethod
    def require_at_least_one_label(cls, value: list[Label]) -> list[Label]:
        """Reject an empty ``labels`` list."""
        if not value:
            # NOTE(review): `ValueError` here is the name imported from
            # `unstructured_ingest.error`, not necessarily the builtin —
            # confirm pydantic turns it into a `ValidationError`.
            raise ValueError("Node must have at least one label.")
        return value
270
+
271
+
272
class _Edge(BaseModel):
    # A directed, labeled connection between two nodes.
    model_config = ConfigDict()

    # Node the edge starts from.
    source: _Node
    # Node the edge points to.
    destination: _Node
    # Either a known `Relationship` member or a free-form relationship name.
    relationship: Relationship | str
278
+
279
+
280
class Label(Enum):
    """Node labels written to Neo4j; each value is the label text used in Cypher."""

    UNSTRUCTURED_ELEMENT = "UnstructuredElement"
    CHUNK = "Chunk"
    DOCUMENT = "Document"
    ENTITY = "Entity"
285
+
286
+
287
class Relationship(Enum):
    """Relationship types the stager itself creates between graph nodes."""

    PART_OF_DOCUMENT = "PART_OF_DOCUMENT"
    PART_OF_CHUNK = "PART_OF_CHUNK"
    NEXT_CHUNK = "NEXT_CHUNK"
    NEXT_ELEMENT = "NEXT_ELEMENT"
    ENTITY_TYPE = "ENTITY_TYPE"
    HAS_ENTITY = "HAS_ENTITY"
294
+
295
+
296
class Neo4jUploaderConfig(UploaderConfig):
    # Options controlling how the staged graph is written to Neo4j.

    # Upper bound on the nodes or edges sent in one generated query.
    batch_size: int = Field(
        default=1000,
        description="Maximal number of nodes/relationships created per transaction.",
    )
    # "cosine" is the only value the `SimilarityFunction` literal allows.
    similarity_function: SimilarityFunction = Field(
        default="cosine",
        description="Vector similarity function used to create index on Chunk nodes",
    )
    # In `run_async` this gates creation of the vector index.
    create_destination: bool = Field(
        default=True,
        description="Create destination if it does not exist",
    )
307
+
308
+
309
+ @dataclass
310
+ class Neo4jUploader(Uploader):
311
+ upload_config: Neo4jUploaderConfig
312
+ connection_config: Neo4jConnectionConfig
313
+ connector_type: str = CONNECTOR_TYPE
314
+
315
@DestinationConnectionError.wrap
def precheck(self) -> None:
    """Open a driver and call ``verify_connectivity`` on it.

    The coroutine is driven to completion with ``asyncio.run``; the method is
    wrapped by ``DestinationConnectionError.wrap``.
    """

    async def _check_connectivity():
        async with self.connection_config.get_client() as driver:
            await driver.verify_connectivity()

    asyncio.run(_check_connectivity())
322
+
323
def is_async(self):
    """Return True; the upload logic lives in the coroutine ``run_async``."""
    return True
325
+
326
async def run_async(self, path: Path, file_data: FileData, **kwargs) -> None:  # type: ignore
    """Load the staged graph at ``path`` and write it to Neo4j.

    Steps, in order: ensure the uniqueness constraints exist, create the
    vector index when embedding dimensions are found and
    ``create_destination`` is set, delete data previously stored for this
    record, then merge the new nodes and edges.
    """
    graph_data = _GraphData.model_validate(get_json_data(path))
    config = self.upload_config

    async with self.connection_config.get_client() as client:
        await self._create_uniqueness_constraints(client)

        dimensions = self._get_embedding_dimensions(graph_data)
        if dimensions and config.create_destination:
            await self._create_vector_index(
                client,
                dimensions=dimensions,
                similarity_function=config.similarity_function,
            )

        await self._delete_old_data_if_exists(file_data, client=client)
        await self._merge_graph(graph_data=graph_data, client=client)
341
+
342
async def _create_uniqueness_constraints(self, client: AsyncDriver) -> None:
    """Create an ``id`` uniqueness constraint for every ``Label``.

    Each statement uses ``IF NOT EXISTS``; constraints are named
    ``<lowercased label>_id``.
    """
    for label in Label:
        label_name = label.value
        logger.info(
            f"Adding id uniqueness constraint for nodes labeled '{label_name}'"
            " if it does not already exist."
        )
        constraint_name = f"{label_name.lower()}_id"
        statement = f"""
            CREATE CONSTRAINT {constraint_name} IF NOT EXISTS
            FOR (n: {label_name}) REQUIRE n.id IS UNIQUE
            """
        await client.execute_query(statement)
355
+
356
async def _create_vector_index(
    self, client: AsyncDriver, dimensions: int, similarity_function: SimilarityFunction
) -> None:
    """Create the vector index on ``Chunk`` nodes if it does not exist.

    Args:
        client: Open async driver used to run the statement.
        dimensions: Vector size written to ``vector.dimensions``.
        similarity_function: Value written to ``vector.similarity_function``.

    Raises:
        UnstructuredIngestError: For any ``ClientError`` other than
            ``EquivalentSchemaRuleAlreadyExists``; the driver error is
            attached as the cause.
    """
    import neo4j.exceptions

    label = Label.CHUNK
    logger.info(
        f"Creating index on nodes labeled '{label.value}' if it does not already exist."
    )
    index_name = f"{label.value.lower()}_vector"
    # NOTE(review): the index targets `n.embedding`, while the stager stores
    # the vector under the "embeddings" property — confirm the node-creation
    # query maps one to the other.
    try:
        await client.execute_query(
            f"""
            CREATE VECTOR INDEX {index_name} IF NOT EXISTS
            FOR (n:{label.value}) ON n.embedding
            OPTIONS {{indexConfig: {{
                `vector.similarity_function`: '{similarity_function}',
                `vector.dimensions`: {dimensions}}}
            }}
            """
        )
    except neo4j.exceptions.ClientError as e:
        if e.code == "Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists":
            logger.info(f"Index on nodes labeled '{label.value}' already exists.")
        else:
            # Chain explicitly so the driver error is kept as the direct cause.
            raise UnstructuredIngestError(str(e)) from e
382
+
383
async def _delete_old_data_if_exists(self, file_data: FileData, client: AsyncDriver) -> None:
    """Delete the document node for ``file_data`` and its attached chunks and elements.

    The query matches the document by ``id`` together with neighbouring
    ``Chunk`` / ``UnstructuredElement`` nodes, detach-deletes both, and logs
    the counters reported in the query summary.
    """
    record_id = file_data.identifier
    logger.info(f"Deleting old data for the record '{record_id}' (if present).")

    statement = f"""
        MATCH (n: `{Label.DOCUMENT.value}` {{id: $identifier}})
        MATCH (n)--(m: `{Label.CHUNK.value}`|`{Label.UNSTRUCTURED_ELEMENT.value}`)
        DETACH DELETE m
        DETACH DELETE n"""
    _records, summary, _keys = await client.execute_query(statement, identifier=record_id)

    counters = summary.counters
    logger.info(
        f"Deleted {counters.nodes_deleted} nodes"
        f" and {counters.relationships_deleted} relationships."
    )
397
+
398
async def _merge_graph(self, graph_data: _GraphData, client: AsyncDriver) -> None:
    """Write all nodes, then all edges, of ``graph_data`` in batches.

    Nodes are grouped by main label and sent in parallel. Edges are grouped
    by (relationship, source label, destination label) and sent one query
    after another.
    """
    batch_size = self.upload_config.batch_size
    node_count = len(graph_data.nodes)
    edge_count = len(graph_data.edges)

    nodes_by_labels: defaultdict[Label, list[_Node]] = defaultdict(list)
    for node in graph_data.nodes:
        nodes_by_labels[node.main_label].append(node)

    logger.info(f"Merging {node_count} graph nodes.")
    node_queries = []
    for label, nodes in nodes_by_labels.items():
        for nodes_batch in batch_generator(nodes, batch_size=batch_size):
            node_queries.append(self._create_nodes_query(nodes_batch, label))
    # NOTE: Processed in parallel as there's no overlap between accessed nodes
    await self._execute_queries(node_queries, client=client, in_parallel=True)
    logger.info(f"Finished merging {node_count} graph nodes.")

    edges_by_relationship: defaultdict[tuple[Relationship | str, Label, Label], list[_Edge]] = (
        defaultdict(list)
    )
    for edge in graph_data.edges:
        group = (edge.relationship, edge.source.main_label, edge.destination.main_label)
        edges_by_relationship[group].append(edge)

    logger.info(f"Merging {edge_count} graph relationships (edges).")
    edge_queries = []
    for group, edges in edges_by_relationship.items():
        relationship, source_label, destination_label = group
        for edges_batch in batch_generator(edges, batch_size=batch_size):
            edge_queries.append(
                self._create_edges_query(
                    edges_batch, relationship, source_label, destination_label
                )
            )
    # NOTE: Processed sequentially to avoid queries locking node access to one another
    await self._execute_queries(edge_queries, client=client)
    logger.info(f"Finished merging {edge_count} graph relationships (edges).")
437
+
438
@staticmethod
async def _execute_queries(
    queries_with_parameters: list[tuple[str, dict]],
    client: AsyncDriver,
    in_parallel: bool = False,
) -> None:
    """Run the given Cypher statements and log how much they created.

    Args:
        queries_with_parameters: ``(query, parameters)`` pairs to execute.
        client: Driver whose ``execute_query`` runs each statement.
        in_parallel: When true, all statements are awaited together through
            ``asyncio.gather``; otherwise they are awaited one after another
            in list order.
    """
    from neo4j import EagerResult

    statement_count = len(queries_with_parameters)
    mode = "parallel" if in_parallel else "sequential"
    logger.info(f"Executing {statement_count} {mode} Cypher statements.")

    outcomes: list[EagerResult]
    if not in_parallel:
        outcomes = []
        for position, (statement, arguments) in enumerate(queries_with_parameters):
            logger.info(f"Statement #{position} started.")
            outcome = await client.execute_query(statement, parameters_=arguments)
            outcomes.append(outcome)
            logger.info(f"Statement #{position} finished.")
    else:
        pending = [
            client.execute_query(statement, parameters_=arguments)
            for statement, arguments in queries_with_parameters
        ]
        outcomes = await asyncio.gather(*pending)

    # Totals are taken from the per-statement summary counters.
    nodes_created = sum(item.summary.counters.nodes_created for item in outcomes)
    relationships_created = sum(
        item.summary.counters.relationships_created for item in outcomes
    )
    logger.info(
        f"Finished executing all ({statement_count}) {mode} Cypher statements. "
        f"Created {nodes_created} nodes, {relationships_created} relationships."
    )
470
+
471
@staticmethod
def _create_nodes_query(nodes: list[_Node], label: Label) -> tuple[str, dict]:
    """Build the parameterized MERGE statement for one batch of nodes.

    Each node is merged on ``id`` under the given label, gets its properties
    and remaining labels set, and, when it carries a vector, has that vector
    stored through ``db.create.setNodeVectorProperty`` as ``embedding``.

    The input nodes are not modified: the ``"embedding"`` entry is read from
    ``node.properties`` and excluded from a copy of the properties, so
    building the query more than once for the same nodes yields the same
    parameters.

    Args:
        nodes: Batch of nodes sharing ``label`` as the label to merge on.
        label: Label used in the MERGE pattern; it is left out of each node's
            ``labels`` parameter.

    Returns:
        The Cypher query string and its parameters dictionary.
    """
    logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{label}'.")
    query_string = f"""
        UNWIND $nodes AS node
        MERGE (n: `{label.value}` {{id: node.id}})
        SET n += node.properties
        SET n:$(node.labels)
        WITH * WHERE node.vector IS NOT NULL
        CALL db.create.setNodeVectorProperty(n, 'embedding', node.vector)
        """
    parameters = {
        "nodes": [
            {
                "id": node.id_,
                "labels": [l.value for l in node.labels if l != label],  # noqa: E741
                # Read (rather than pop) the vector so the caller's node keeps it.
                "vector": node.properties.get("embedding"),
                # The vector is written via the procedure call above, so it is
                # kept out of the regular property map.
                "properties": {
                    key: value
                    for key, value in node.properties.items()
                    if key != "embedding"
                },
            }
            for node in nodes
        ]
    }
    return query_string, parameters
494
+
495
@staticmethod
def _create_edges_query(
    edges: list[_Edge],
    relationship: Relationship | str,
    source_label: Label,
    destination_label: Label,
) -> tuple[str, dict]:
    """Build the parameterized MERGE statement for one batch of edges.

    Args:
        edges: Edges that all share the given relationship and endpoint labels.
        relationship: Relationship type, either a ``Relationship`` member
            (its ``value`` is used) or a plain string.
        source_label: Label used to match each edge's source node by ``id``.
        destination_label: Label used to match each edge's destination node
            by ``id``.

    Returns:
        The Cypher query string and its parameters dictionary.
    """
    # Logged before normalization, i.e. with the relationship exactly as passed in.
    logger.info(f"Preparing MERGE query for {len(edges)} {relationship} relationships.")

    if isinstance(relationship, Relationship):
        relationship = relationship.value

    query_string = f"""
        UNWIND $edges AS edge
        MATCH (u: `{source_label.value}` {{id: edge.source}})
        MATCH (v: `{destination_label.value}` {{id: edge.destination}})
        MERGE (u)-[:`{relationship}`]->(v)
        """

    endpoint_pairs = []
    for item in edges:
        endpoint_pairs.append({"source": item.source.id_, "destination": item.destination.id_})

    return query_string, {"edges": endpoint_pairs}
518
+
519
def _get_embedding_dimensions(self, graph_data: _GraphData) -> int | None:
    """Return the embedding length of the first chunk node that has one.

    Scans ``graph_data.nodes`` in order for a node labeled as a chunk whose
    properties contain ``"embeddings"`` and returns the length of that value;
    returns ``None`` when no such node exists.
    """
    # NOTE(review): this looks up "embeddings" (plural) while
    # `_create_nodes_query` reads "embedding" (singular) -- confirm which key
    # the stager actually writes.
    chunks_with_embeddings = (
        candidate
        for candidate in graph_data.nodes
        if Label.CHUNK in candidate.labels and "embeddings" in candidate.properties
    )
    first_match = next(chunks_with_embeddings, None)
    if first_match is None:
        return None
    return len(first_match.properties["embeddings"])
526
+
527
+
528
# Destination registry entry bundling the Neo4j connection config with the
# stager and uploader classes and their respective config classes.
neo4j_destination_entry = DestinationRegistryEntry(
    connection_config=Neo4jConnectionConfig,
    uploader=Neo4jUploader,
    uploader_config=Neo4jUploaderConfig,
    upload_stager=Neo4jUploadStager,
    upload_stager_config=Neo4jUploadStagerConfig,
)