unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,534 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import uuid
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from contextlib import asynccontextmanager
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Literal, Optional
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field, Secret, ValidationError, field_validator
|
|
13
|
+
|
|
14
|
+
from unstructured_ingest.data_types.entities import EntitiesData, Entity, EntityRelationship
|
|
15
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
16
|
+
from unstructured_ingest.error import (
|
|
17
|
+
DestinationConnectionError,
|
|
18
|
+
UnstructuredIngestError,
|
|
19
|
+
ValueError,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.interfaces import (
|
|
22
|
+
AccessConfig,
|
|
23
|
+
ConnectionConfig,
|
|
24
|
+
Uploader,
|
|
25
|
+
UploaderConfig,
|
|
26
|
+
UploadStager,
|
|
27
|
+
UploadStagerConfig,
|
|
28
|
+
)
|
|
29
|
+
from unstructured_ingest.logger import logger
|
|
30
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
31
|
+
DestinationRegistryEntry,
|
|
32
|
+
)
|
|
33
|
+
from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
|
|
34
|
+
from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
|
|
35
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
36
|
+
|
|
37
|
+
SimilarityFunction = Literal["cosine"]
|
|
38
|
+
|
|
39
|
+
if TYPE_CHECKING:
|
|
40
|
+
from neo4j import AsyncDriver, Auth
|
|
41
|
+
from networkx import Graph, MultiDiGraph
|
|
42
|
+
|
|
43
|
+
CONNECTOR_TYPE = "neo4j"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Neo4jAccessConfig(AccessConfig):
    # Secret part of the Neo4j connection settings.
    # The password is combined with the connection config's username to build
    # the basic-auth credentials handed to the driver.
    password: str
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Neo4jConnectionConfig(ConnectionConfig):
    # Connection settings for a Neo4j database and a factory for async drivers.
    access_config: Secret[Neo4jAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    username: str = Field(default="neo4j")
    uri: str = Field(description="Neo4j Connection URI <scheme>://<host>:<port>")
    database: str = Field(default="neo4j", description="Name of the target database")

    @requires_dependencies(["neo4j"], extras="neo4j")
    @asynccontextmanager
    async def get_client(self) -> AsyncGenerator["AsyncDriver", None]:
        """Yield an async Neo4j driver and close it when the context exits.

        The driver is closed in a ``finally`` clause, so it is released even when
        the body of the ``async with`` block raises.
        """
        from neo4j import AsyncGraphDatabase

        driver_kwargs = self._get_driver_parameters()
        driver = AsyncGraphDatabase.driver(**driver_kwargs)
        logger.info(f"Created driver connecting to the database '{self.database}' at {self.uri}.")
        try:
            yield driver
        finally:
            await driver.close()
            logger.info(
                f"Closed driver connecting to the database '{self.database}' at {self.uri}."
            )

    def _get_driver_parameters(self) -> dict:
        """Return the keyword arguments passed to ``AsyncGraphDatabase.driver``."""
        return dict(uri=self.uri, auth=self._get_auth(), database=self.database)

    @requires_dependencies(["neo4j"], extras="neo4j")
    def _get_auth(self) -> "Auth":
        """Build basic-auth credentials from the username and the secret password."""
        from neo4j import Auth

        secrets = self.access_config.get_secret_value()
        return Auth("basic", self.username, secrets.password)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class Neo4jUploadStagerConfig(UploadStagerConfig):
    # The Neo4j stager declares no options of its own; this subclass only gives
    # the stager a connector-specific config type.
    pass
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class Neo4jUploadStager(UploadStager):
    """Stage partitioned elements as a JSON-serialized graph.

    The graph holds one document node, one node per element (or chunk), one node per
    original element of each chunk and, when element metadata carries them, entity nodes.
    """

    # NOTE: this class is a stdlib dataclass, so the default has to come from
    # ``dataclasses.field``. A pydantic ``Field(...)`` is not interpreted here and would
    # itself become the attribute value instead of a ``Neo4jUploadStagerConfig`` instance.
    upload_stager_config: Neo4jUploadStagerConfig = field(
        default_factory=Neo4jUploadStagerConfig
    )

    def run(  # type: ignore
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Build the graph for one file and write it as JSON.

        Args:
            elements_filepath: File with the elements, loaded through ``get_json_data``.
            file_data: Describes the source file; used to create the document node.
            output_dir: Directory that receives the staged file (created if missing).
            output_filename: Base name of the staged file; ``.json`` is appended.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            Path of the written ``<output_dir>/<output_filename>.json`` file.
        """
        elements = get_json_data(elements_filepath)
        nx_graph = self._create_lexical_graph(
            elements, self._create_document_node(file_data=file_data)
        )
        output_filepath = Path(output_dir) / f"{output_filename}.json"
        output_filepath.parent.mkdir(parents=True, exist_ok=True)

        with open(output_filepath, "w") as file:
            file.write(_GraphData.from_nx(nx_graph).model_dump_json())

        return output_filepath

    def _add_entities(self, entities: list[Entity], graph: "Graph", element_node: _Node) -> None:
        """Link ``element_node`` to each entity and each entity to its type node.

        Both the entity and its type are stored as nodes labeled ``Label.ENTITY``;
        their ids are the entity text and the type name respectively.
        """
        for entity in entities:
            entity_node = _Node(
                labels=[Label.ENTITY], properties={"id": entity.entity}, id_=entity.entity
            )
            graph.add_edge(
                entity_node,
                _Node(labels=[Label.ENTITY], properties={"id": entity.type}, id_=entity.type),
                relationship=Relationship.ENTITY_TYPE,
            )
            graph.add_edge(element_node, entity_node, relationship=Relationship.HAS_ENTITY)

    def _add_entity_relationships(
        self, relationships: list[EntityRelationship], graph: "Graph"
    ) -> None:
        """Add one edge per entity relationship, labeled with the relationship's own name."""
        for relationship in relationships:
            from_node = _Node(
                labels=[Label.ENTITY],
                properties={"id": relationship.from_},
                id_=relationship.from_,
            )
            to_node = _Node(
                labels=[Label.ENTITY], properties={"id": relationship.to}, id_=relationship.to
            )
            graph.add_edge(from_node, to_node, relationship=relationship.relationship)

    def _add_entity_data(self, element: dict, graph: "Graph", element_node: _Node) -> None:
        """Add the entities found in ``element["metadata"]["entities"]`` to the graph.

        Two input shapes are handled: a list of entity dicts (non-dict items are
        skipped) and a dict validated as ``EntitiesData`` (entities plus relationships).
        Data failing pydantic validation is reported with a warning and skipped.
        """
        entities = element.get("metadata", {}).get("entities", {})
        if not entities:
            return None
        try:
            if isinstance(entities, list):
                self._add_entities(
                    [Entity.model_validate(e) for e in entities if isinstance(e, dict)],
                    graph,
                    element_node,
                )
            elif isinstance(entities, dict):
                entity_data = EntitiesData.model_validate(entities)
                self._add_entities(entity_data.items, graph, element_node)
                self._add_entity_relationships(entity_data.relationships, graph)
        except ValidationError:
            logger.warning(
                "Failed to add entities to the graph. "
                "Please check the format of the entities in the input data."
            )
        return None

    def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> "Graph":
        """Create the directed multigraph linking elements, chunks and the document.

        Args:
            elements: Element dicts in document order.
            document_node: Node every element (and original element) is attached to.

        Returns:
            A ``networkx.MultiDiGraph`` whose edges carry a ``relationship`` attribute.
        """
        import networkx as nx

        graph = nx.MultiDiGraph()
        graph.add_node(document_node)

        previous_node: Optional[_Node] = None
        for element in elements:
            element_node = self._create_element_node(element)
            order_relationship = (
                Relationship.NEXT_CHUNK if self._is_chunk(element) else Relationship.NEXT_ELEMENT
            )
            if previous_node:
                # The ordering edge is added from the current node to the previous one.
                graph.add_edge(element_node, previous_node, relationship=order_relationship)

            previous_node = element_node
            graph.add_edge(element_node, document_node, relationship=Relationship.PART_OF_DOCUMENT)

            self._add_entity_data(element, graph, element_node)

            if self._is_chunk(element):
                # Original elements of a chunk belong both to the chunk and to the document.
                for origin_element in format_and_truncate_orig_elements(element, include_text=True):
                    origin_element_node = self._create_element_node(origin_element)

                    graph.add_edge(
                        origin_element_node,
                        element_node,
                        relationship=Relationship.PART_OF_CHUNK,
                    )
                    graph.add_edge(
                        origin_element_node,
                        document_node,
                        relationship=Relationship.PART_OF_DOCUMENT,
                    )
                    self._add_entity_data(origin_element, graph, origin_element_node)

        return graph

    # TODO(Filip Knefel): Ensure _is_chunk is as reliable as possible, consider different checks
    def _is_chunk(self, element: dict) -> bool:
        """Return True when the element's metadata has an ``orig_elements`` key."""
        return "orig_elements" in element.get("metadata", {})

    def _create_document_node(self, file_data: FileData) -> _Node:
        """Create the document node, identified by ``file_data.identifier``.

        The filename and created/modified dates are copied into the node
        properties only when they are set.
        """
        properties = {}
        if file_data.source_identifiers:
            properties["name"] = file_data.source_identifiers.filename
        if file_data.metadata.date_created:
            properties["date_created"] = file_data.metadata.date_created
        if file_data.metadata.date_modified:
            properties["date_modified"] = file_data.metadata.date_modified
        return _Node(id_=file_data.identifier, properties=properties, labels=[Label.DOCUMENT])

    def _create_element_node(self, element: dict) -> _Node:
        """Create a node for an element, labeled as a chunk or a plain element.

        Raises:
            KeyError: If the element has no ``element_id``.
        """
        properties = {"id": element["element_id"]}

        if text := element.get("text"):
            # if we have chunks, we won't have text here for the original elements
            properties["text"] = text

        if embeddings := element.get("embeddings"):
            properties["embeddings"] = embeddings

        label = Label.CHUNK if self._is_chunk(element) else Label.UNSTRUCTURED_ELEMENT
        return _Node(id_=element["element_id"], properties=properties, labels=[label])
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class _GraphData(BaseModel):
    """Serializable form of a graph: a list of nodes and a list of edges."""

    nodes: list[_Node]
    edges: list[_Edge]

    @classmethod
    def from_nx(cls, nx_graph: "MultiDiGraph") -> _GraphData:
        """Convert a networkx graph into ``_GraphData``.

        Every edge must carry a ``relationship`` attribute. Values that are a
        ``Relationship`` member, or the value of one, are stored as the member;
        any other value is stored unchanged.

        Raises:
            KeyError: If an edge has no ``relationship`` attribute.
        """
        members_by_value = {member.value: member for member in Relationship}

        def coerce(raw: Relationship | str) -> Relationship | str:
            # A plain lookup is used instead of ``raw in Relationship``: before
            # Python 3.12 that containment check raises TypeError for non-members.
            if isinstance(raw, Relationship):
                return raw
            return members_by_value.get(raw, raw)

        edges = [
            _Edge(
                source=source,
                destination=destination,
                relationship=coerce(attributes["relationship"]),
            )
            for source, destination, attributes in nx_graph.edges(data=True)
        ]
        return _GraphData(nodes=list(nx_graph.nodes()), edges=edges)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
class _Node(BaseModel):
    """Graph node with labels, free-form properties and an identifier.

    ``__hash__`` uses ``id_`` only, so the hash does not depend on the labels
    or properties of the node.
    """

    model_config = ConfigDict()

    labels: list[Label]
    properties: dict = Field(default_factory=dict)
    # A random UUID is generated when no identifier is given.
    id_: str = Field(default_factory=lambda: str(uuid.uuid4()))

    def __hash__(self):
        return hash(self.id_)

    @property
    def main_label(self) -> Label:
        """First label of the node."""
        return self.labels[0]

    # ``@field_validator`` must be the outermost decorator: with ``@classmethod`` on
    # top, pydantic does not register the validator and empty label lists pass.
    @field_validator("labels", mode="after")
    @classmethod
    def require_at_least_one_label(cls, value: list[Label]) -> list[Label]:
        """Reject nodes without labels so that ``main_label`` is always defined."""
        if not value:
            # NOTE(review): ``ValueError`` is the name imported from
            # ``unstructured_ingest.error`` at the top of the file; confirm whether
            # pydantic turns it into a ``ValidationError`` or it propagates as-is.
            raise ValueError("Node must have at least one label.")
        return value
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class _Edge(BaseModel):
    # Directed edge between two nodes.
    model_config = ConfigDict()

    # Node the edge starts from.
    source: _Node
    # Node the edge points to.
    destination: _Node
    # Either one of the predefined relationships or a free-form relationship name.
    relationship: Relationship | str
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
class Label(Enum):
    """Node labels used in the graph."""

    # Node created for an element that is not a chunk.
    UNSTRUCTURED_ELEMENT = "UnstructuredElement"
    # Node created for an element whose metadata contains original elements.
    CHUNK = "Chunk"
    # Node representing the source document.
    DOCUMENT = "Document"
    # Node representing an entity or an entity type.
    ENTITY = "Entity"
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
class Relationship(Enum):
    """Predefined relationship names used for graph edges."""

    # element / original element -> document
    PART_OF_DOCUMENT = "PART_OF_DOCUMENT"
    # original element -> chunk
    PART_OF_CHUNK = "PART_OF_CHUNK"
    # ordering edge between consecutive chunks
    NEXT_CHUNK = "NEXT_CHUNK"
    # ordering edge between consecutive elements
    NEXT_ELEMENT = "NEXT_ELEMENT"
    # entity -> entity type
    ENTITY_TYPE = "ENTITY_TYPE"
    # element -> entity
    HAS_ENTITY = "HAS_ENTITY"
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
class Neo4jUploaderConfig(UploaderConfig):
    # Options controlling how the staged graph is written to Neo4j.
    batch_size: int = Field(
        default=1000,
        description="Maximal number of nodes/relationships created per transaction.",
    )
    similarity_function: SimilarityFunction = Field(
        default="cosine",
        description="Vector similarity function used to create index on Chunk nodes",
    )
    # Read by the uploader before it creates the vector index.
    create_destination: bool = Field(
        default=True,
        description="Create destination if it does not exist",
    )
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
@dataclass
|
|
310
|
+
class Neo4jUploader(Uploader):
|
|
311
|
+
upload_config: Neo4jUploaderConfig
|
|
312
|
+
connection_config: Neo4jConnectionConfig
|
|
313
|
+
connector_type: str = CONNECTOR_TYPE
|
|
314
|
+
|
|
315
|
+
@DestinationConnectionError.wrap
def precheck(self) -> None:
    """Open a driver and verify connectivity to the configured database.

    The coroutine is run with ``asyncio.run`` so the check can be called from
    synchronous code.
    """

    async def _check_connectivity():
        async with self.connection_config.get_client() as driver:
            await driver.verify_connectivity()

    asyncio.run(_check_connectivity())
|
|
322
|
+
|
|
323
|
+
def is_async(self) -> bool:
    """Report that this uploader is driven through ``run_async``."""
    return True
|
|
325
|
+
|
|
326
|
+
async def run_async(self, path: Path, file_data: FileData, **kwargs) -> None:  # type: ignore
    """Upload the staged graph stored at ``path`` into Neo4j.

    Steps, all performed with a single driver: create uniqueness constraints,
    optionally create the vector index, delete data previously stored for this
    record, then merge the new graph.
    """
    graph_data = _GraphData.model_validate(get_json_data(path))

    async with self.connection_config.get_client() as client:
        await self._create_uniqueness_constraints(client)

        embedding_dimensions = self._get_embedding_dimensions(graph_data)
        # The index is created only when dimensions are known and creation is enabled.
        should_create_index = embedding_dimensions and self.upload_config.create_destination
        if should_create_index:
            await self._create_vector_index(
                client,
                dimensions=embedding_dimensions,
                similarity_function=self.upload_config.similarity_function,
            )

        await self._delete_old_data_if_exists(file_data, client=client)
        await self._merge_graph(graph_data=graph_data, client=client)
|
|
341
|
+
|
|
342
|
+
async def _create_uniqueness_constraints(self, client: AsyncDriver) -> None:
    """Create an ``id`` uniqueness constraint for every ``Label``, if missing.

    Constraints are named ``<lowercased label>_id`` and are created one query
    at a time.
    """
    for node_label in Label:
        label_name = node_label.value
        logger.info(
            f"Adding id uniqueness constraint for nodes labeled '{label_name}'"
            " if it does not already exist."
        )
        constraint_name = f"{label_name.lower()}_id"
        query = f"""
            CREATE CONSTRAINT {constraint_name} IF NOT EXISTS
            FOR (n: {label_name}) REQUIRE n.id IS UNIQUE
            """
        await client.execute_query(query)
|
|
355
|
+
|
|
356
|
+
async def _create_vector_index(
    self, client: AsyncDriver, dimensions: int, similarity_function: SimilarityFunction
) -> None:
    """Create the vector index on ``Chunk`` nodes if it does not exist yet.

    Args:
        client: Driver used to execute the query.
        dimensions: Number of dimensions of the indexed vectors.
        similarity_function: Similarity function configured for the index.

    Raises:
        UnstructuredIngestError: For any client error other than an equivalent
            index already existing; the driver error is attached as the cause.
    """
    import neo4j.exceptions

    label = Label.CHUNK
    logger.info(
        f"Creating index on nodes labeled '{label.value}' if it does not already exist."
    )
    index_name = f"{label.value.lower()}_vector"
    # NOTE(review): the index targets the ``embedding`` property, while the stager
    # stores vectors under the ``embeddings`` key of the node properties. Confirm
    # that the node-creation query writes them to ``embedding``.
    try:
        await client.execute_query(
            f"""
            CREATE VECTOR INDEX {index_name} IF NOT EXISTS
            FOR (n:{label.value}) ON n.embedding
            OPTIONS {{indexConfig: {{
                `vector.similarity_function`: '{similarity_function}',
                `vector.dimensions`: {dimensions}}}
            }}
            """
        )
    except neo4j.exceptions.ClientError as e:
        if e.code == "Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists":
            logger.info(f"Index on nodes labeled '{label.value}' already exists.")
        else:
            # Chain explicitly so the original driver error stays attached as the cause.
            raise UnstructuredIngestError(str(e)) from e
|
|
382
|
+
|
|
383
|
+
async def _delete_old_data_if_exists(self, file_data: FileData, client: AsyncDriver) -> None:
    """Delete the document node for ``file_data`` and its attached chunk/element nodes.

    The document is matched by ``file_data.identifier``; nodes labeled as chunk
    or unstructured element that are connected to it (in either direction) are
    detach-deleted together with the document node. The number of deleted
    nodes and relationships is logged.

    NOTE(review): the second MATCH is not OPTIONAL, so a document node with no
    attached chunk/element nodes produces no rows and is left in place.
    Confirm that this is intended.

    Args:
        file_data: Record whose previously uploaded graph should be removed.
        client: Driver used to execute the Cypher statement.
    """
    record_id = file_data.identifier
    logger.info(f"Deleting old data for the record '{record_id}' (if present).")
    delete_statement = f"""
        MATCH (n: `{Label.DOCUMENT.value}` {{id: $identifier}})
        MATCH (n)--(m: `{Label.CHUNK.value}`|`{Label.UNSTRUCTURED_ELEMENT.value}`)
        DETACH DELETE m
        DETACH DELETE n"""
    _records, summary, _keys = await client.execute_query(
        delete_statement,
        identifier=record_id,
    )
    counters = summary.counters
    logger.info(
        f"Deleted {counters.nodes_deleted} nodes"
        f" and {counters.relationships_deleted} relationships."
    )
|
|
397
|
+
|
|
398
|
+
async def _merge_graph(self, graph_data: _GraphData, client: AsyncDriver) -> None:
    """Merge the nodes, then the edges, of ``graph_data`` into the database.

    Nodes are grouped by their main label and edges by
    ``(relationship, source main label, destination main label)``. Each group
    is split into batches of ``self.upload_config.batch_size`` and one MERGE
    query is built per batch.

    Args:
        graph_data: Nodes and edges to write.
        client: Driver used to execute the generated queries.
    """
    batch_size = self.upload_config.batch_size

    # --- nodes ---------------------------------------------------------
    node_total = len(graph_data.nodes)
    grouped_nodes: defaultdict[Label, list[_Node]] = defaultdict(list)
    for graph_node in graph_data.nodes:
        grouped_nodes[graph_node.main_label].append(graph_node)

    logger.info(f"Merging {node_total} graph nodes.")
    node_queries = []
    for label, members in grouped_nodes.items():
        for nodes_batch in batch_generator(members, batch_size=batch_size):
            node_queries.append(self._create_nodes_query(nodes_batch, label))
    # NOTE: Processed in parallel as there's no overlap between accessed nodes
    await self._execute_queries(node_queries, client=client, in_parallel=True)
    logger.info(f"Finished merging {node_total} graph nodes.")

    # --- edges ---------------------------------------------------------
    edge_total = len(graph_data.edges)
    grouped_edges: defaultdict[tuple[Relationship | str, Label, Label], list[_Edge]] = (
        defaultdict(list)
    )
    for graph_edge in graph_data.edges:
        group_key = (
            graph_edge.relationship,
            graph_edge.source.main_label,
            graph_edge.destination.main_label,
        )
        grouped_edges[group_key].append(graph_edge)

    logger.info(f"Merging {edge_total} graph relationships (edges).")
    edge_queries = []
    for (relationship, source_label, destination_label), members in grouped_edges.items():
        for edges_batch in batch_generator(members, batch_size=batch_size):
            edge_queries.append(
                self._create_edges_query(
                    edges_batch, relationship, source_label, destination_label
                )
            )
    # NOTE: Processed sequentially to avoid queries locking node access to one another
    await self._execute_queries(edge_queries, client=client)
    logger.info(f"Finished merging {edge_total} graph relationships (edges).")
|
|
437
|
+
|
|
438
|
+
@staticmethod
async def _execute_queries(
    queries_with_parameters: list[tuple[str, dict]],
    client: AsyncDriver,
    in_parallel: bool = False,
) -> None:
    """Run a list of Cypher statements and log how many nodes/relationships were created.

    Args:
        queries_with_parameters: ``(query, parameters)`` pairs; the parameters
            are passed to the driver through ``parameters_``.
        client: Driver used to execute each statement.
        in_parallel: When true, all statements are awaited together with
            ``asyncio.gather``; otherwise they are awaited one after another,
            with a log line before and after each statement.
    """
    from neo4j import EagerResult

    mode = "parallel" if in_parallel else "sequential"
    statement_total = len(queries_with_parameters)
    logger.info(f"Executing {statement_total} {mode} Cypher statements.")

    results: list[EagerResult]
    if in_parallel:
        pending = [
            client.execute_query(statement, parameters_=params)
            for statement, params in queries_with_parameters
        ]
        results = await asyncio.gather(*pending)
    else:
        results = []
        for index, (statement, params) in enumerate(queries_with_parameters):
            logger.info(f"Statement #{index} started.")
            results.append(await client.execute_query(statement, parameters_=params))
            logger.info(f"Statement #{index} finished.")

    nodes_created = sum(res.summary.counters.nodes_created for res in results)
    relationships_created = sum(res.summary.counters.relationships_created for res in results)
    logger.info(
        f"Finished executing all ({statement_total}) {mode} Cypher statements. "
        f"Created {nodes_created} nodes, {relationships_created} relationships."
    )
|
|
470
|
+
|
|
471
|
+
@staticmethod
def _create_nodes_query(nodes: list[_Node], label: Label) -> tuple[str, dict]:
    """Build the batched MERGE statement and parameters for nodes sharing ``label``.

    Each node is merged on ``id`` under ``label``, its properties are added
    with ``SET n += ...``, its remaining labels are applied, and, when a
    vector is present, it is stored through
    ``db.create.setNodeVectorProperty`` as the ``embedding`` property.

    The input nodes are not modified: the ``"embedding"`` entry is removed
    from a copy of ``node.properties``, so building the query again for the
    same nodes still yields their vectors.

    NOTE(review): the vector is read from the singular key ``"embedding"``,
    whereas the dimension-inference helper in this class looks at
    ``"embeddings"``. Confirm which key the stager actually writes.

    Args:
        nodes: Nodes to merge; all are written under ``label``.
        label: Main label used in the MERGE pattern; excluded from the extra
            labels set afterwards.

    Returns:
        ``(query_string, parameters)`` where ``parameters["nodes"]`` holds one
        dict per node with the keys ``id``, ``labels``, ``vector`` and
        ``properties``.
    """
    logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{label}'.")
    query_string = f"""
        UNWIND $nodes AS node
        MERGE (n: `{label.value}` {{id: node.id}})
        SET n += node.properties
        SET n:$(node.labels)
        WITH * WHERE node.vector IS NOT NULL
        CALL db.create.setNodeVectorProperty(n, 'embedding', node.vector)
        """
    node_rows = []
    for node in nodes:
        # Work on a shallow copy so the caller's node keeps its embedding.
        properties = dict(node.properties)
        vector = properties.pop("embedding", None)
        node_rows.append(
            {
                "id": node.id_,
                "labels": [other.value for other in node.labels if other != label],
                "vector": vector,
                "properties": properties,
            }
        )
    return query_string, {"nodes": node_rows}
|
|
494
|
+
|
|
495
|
+
@staticmethod
def _create_edges_query(
    edges: list[_Edge],
    relationship: Relationship | str,
    source_label: Label,
    destination_label: Label,
) -> tuple[str, dict]:
    """Build the batched MERGE statement and parameters for one kind of relationship.

    Source and destination nodes are matched by ``id`` under the given labels
    and a relationship of the given type is merged from source to destination.

    Args:
        edges: Edges to merge; only the ids of their endpoints are sent.
        relationship: Relationship type, either a ``Relationship`` member
            (its ``value`` is used) or a plain string.
        source_label: Label used to match the source node.
        destination_label: Label used to match the destination node.

    Returns:
        ``(query_string, parameters)`` where ``parameters["edges"]`` holds one
        ``{"source": ..., "destination": ...}`` dict per edge.
    """
    # Logged before unwrapping, so an enum member is rendered as-is here.
    logger.info(f"Preparing MERGE query for {len(edges)} {relationship} relationships.")
    if isinstance(relationship, Relationship):
        relationship_name = relationship.value
    else:
        relationship_name = relationship

    statement = f"""
        UNWIND $edges AS edge
        MATCH (u: `{source_label.value}` {{id: edge.source}})
        MATCH (v: `{destination_label.value}` {{id: edge.destination}})
        MERGE (u)-[:`{relationship_name}`]->(v)
        """
    edge_rows = [
        {"source": item.source.id_, "destination": item.destination.id_} for item in edges
    ]
    return statement, {"edges": edge_rows}
|
|
518
|
+
|
|
519
|
+
def _get_embedding_dimensions(self, graph_data: _GraphData) -> int | None:
    """Embedding dimensions inferred from chunk nodes or None if it can't be determined.

    Scans the nodes in order and returns the length of the first non-empty
    vector found on a node labeled as a chunk. Both the plural key
    ``"embeddings"`` and the singular ``"embedding"`` (the key consumed when
    the node MERGE query is built) are accepted, and nodes whose vector is
    missing, ``None`` or empty are skipped instead of raising or reporting a
    zero-sized vector.

    Args:
        graph_data: Graph whose nodes are inspected.

    Returns:
        The vector length, or ``None`` when no chunk node carries a vector.
    """
    for node in graph_data.nodes:
        if Label.CHUNK not in node.labels:
            continue
        for key in ("embeddings", "embedding"):
            vector = node.properties.get(key)
            if vector:
                return len(vector)

    return None
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
# Destination registry entry bundling the Neo4j connection config with the
# upload stager and uploader classes and their respective config classes.
neo4j_destination_entry = DestinationRegistryEntry(
    connection_config=Neo4jConnectionConfig,
    upload_stager=Neo4jUploadStager,
    upload_stager_config=Neo4jUploadStagerConfig,
    uploader=Neo4jUploader,
    uploader_config=Neo4jUploaderConfig,
)
|
|
File without changes
|