unstructured_ingest-1.2.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/processes/connectors/milvus.py
@@ -0,0 +1,311 @@
+import json
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+from dateutil import parser
+from pydantic import Field, Secret
+
+from unstructured_ingest.data_types.file_data import FileData
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    KeyError,
+    WriteError,
+)
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.utils.constants import RECORD_ID_LABEL
+from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from pymilvus import MilvusClient
+
+CONNECTOR_TYPE = "milvus"
+
+
+class MilvusAccessConfig(AccessConfig):
+    password: Optional[str] = Field(default=None, description="Milvus password")
+    token: Optional[str] = Field(default=None, description="Milvus access token")
+
+
+class MilvusConnectionConfig(ConnectionConfig):
+    access_config: Secret[MilvusAccessConfig] = Field(
+        default=MilvusAccessConfig(), validate_default=True
+    )
+    uri: Optional[str] = Field(
+        default=None, description="Milvus uri", examples=["http://localhost:19530"]
+    )
+    user: Optional[str] = Field(default=None, description="Milvus user")
+    db_name: Optional[str] = Field(default=None, description="Milvus database name")
+
+    def get_connection_kwargs(self) -> dict[str, Any]:
+        access_config = self.access_config.get_secret_value()
+        access_config_dict = access_config.model_dump()
+        connection_config_dict = self.model_dump()
+        connection_config_dict.pop("access_config", None)
+        connection_config_dict.update(access_config_dict)
+        # Drop any that were not set explicitly
+        connection_config_dict = {k: v for k, v in connection_config_dict.items() if v is not None}
+        return connection_config_dict
+
+    @requires_dependencies(["pymilvus"], extras="milvus")
+    @contextmanager
+    def get_client(self) -> Generator["MilvusClient", None, None]:
+        from pymilvus import MilvusClient
+
+        client = None
+        try:
+            client = MilvusClient(**self.get_connection_kwargs())
+            yield client
+        finally:
+            if client:
+                client.close()
+
+
+class MilvusUploadStagerConfig(UploadStagerConfig):
+    fields_to_include: Optional[list[str]] = None
+    """If set - list of fields to include in the output.
+    Unspecified fields are removed from the elements.
+    This action takes place after metadata flattening.
+    Missing fields will cause stager to throw KeyError."""
+
+    flatten_metadata: bool = True
+    """If set - flatten "metadata" key and put contents directly into data"""
+
+
+@dataclass
+class MilvusUploadStager(UploadStager):
+    upload_stager_config: MilvusUploadStagerConfig = field(
+        default_factory=lambda: MilvusUploadStagerConfig()
+    )
+
+    @staticmethod
+    def parse_date_string(date_string: str) -> float:
+        try:
+            timestamp = float(date_string)
+            return timestamp
+        except ValueError:
+            pass
+
+        try:
+            dt = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
+            return dt.timestamp()
+        except ValueError:
+            pass
+
+        return parser.parse(date_string).timestamp()
+
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        working_data = element_dict.copy()
+
+        if self.upload_stager_config.flatten_metadata:
+            metadata: dict[str, Any] = working_data.pop("metadata", {})
+            flattened_metadata = flatten_dict(
+                metadata,
+                separator="_",
+                flatten_lists=False,
+                remove_none=True,
+            )
+            working_data.update(flattened_metadata)
+
+        # TODO: milvus sdk doesn't seem to support defaults via the schema yet,
+        # remove once that gets updated
+        defaults = {"is_continuation": False}
+        for default in defaults:
+            if default not in working_data:
+                working_data[default] = defaults[default]
+
+        if self.upload_stager_config.fields_to_include:
+            data_keys = set(working_data.keys())
+            for data_key in data_keys:
+                if data_key not in self.upload_stager_config.fields_to_include:
+                    working_data.pop(data_key)
+            for field_include_key in self.upload_stager_config.fields_to_include:
+                if field_include_key not in working_data:
+                    raise KeyError(f"Field '{field_include_key}' is missing in data!")
+
+        datetime_columns = [
+            "data_source_date_created",
+            "data_source_date_modified",
+            "data_source_date_processed",
+            "last_modified",
+        ]
+
+        json_dumps_fields = ["languages", "data_source_permissions_data"]
+
+        for datetime_column in datetime_columns:
+            if datetime_column in working_data:
+                working_data[datetime_column] = self.parse_date_string(
+                    working_data[datetime_column]
+                )
+        for json_dumps_field in json_dumps_fields:
+            if json_dumps_field in working_data:
+                working_data[json_dumps_field] = json.dumps(working_data[json_dumps_field])
+        working_data[RECORD_ID_LABEL] = file_data.identifier
+        return working_data
+
+
+class MilvusUploaderConfig(UploaderConfig):
+    db_name: Optional[str] = Field(default=None, description="Milvus database name")
+    collection_name: str = Field(description="Milvus collections to write to")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
+
+
+@dataclass
+class MilvusUploader(Uploader):
+    connection_config: MilvusConnectionConfig
+    upload_config: MilvusUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def has_dynamic_fields_enabled(self) -> bool:
+        """Check if the target collection has dynamic fields enabled."""
+        try:
+            with self.get_client() as client:
+                collection_info = client.describe_collection(self.upload_config.collection_name)
+
+                # Check if dynamic field is enabled
+                # The schema info should contain enable_dynamic_field or enableDynamicField
+                schema_info = collection_info.get(
+                    "enable_dynamic_field",
+                    collection_info.get("enableDynamicField", False),
+                )
+                return bool(schema_info)
+        except Exception as e:
+            logger.warning(f"Could not determine if collection has dynamic fields enabled: {e}")
+            return False
+
+    @DestinationConnectionError.wrap
+    def precheck(self):
+        from pymilvus import MilvusException
+
+        try:
+            with self.get_client() as client:
+                if not client.has_collection(self.upload_config.collection_name):
+                    raise DestinationConnectionError(
+                        f"Collection '{self.upload_config.collection_name}' does not exist"
+                    )
+
+        except MilvusException as milvus_exception:
+            raise DestinationConnectionError(
+                f"failed to precheck Milvus: {str(milvus_exception.message)}"
+            ) from milvus_exception
+
+    @contextmanager
+    def get_client(self) -> Generator["MilvusClient", None, None]:
+        with self.connection_config.get_client() as client:
+            if db_name := self.upload_config.db_name:
+                client.using_database(db_name=db_name)
+            yield client
+
+    def delete_by_record_id(self, file_data: FileData) -> None:
+        logger.info(
+            f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from milvus collection {self.upload_config.collection_name}"
+        )
+        with self.get_client() as client:
+            delete_filter = f'{self.upload_config.record_id_key} == "{file_data.identifier}"'
+            resp = client.delete(
+                collection_name=self.upload_config.collection_name, filter=delete_filter
+            )
+            logger.info(
+                "deleted {} records from milvus collection {}".format(
+                    resp["delete_count"], self.upload_config.collection_name
+                )
+            )
+
+    @requires_dependencies(["pymilvus"], extras="milvus")
+    def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
+        """
+        Conforms the provided data to the schema of the target Milvus collection.
+        - If dynamic fields are enabled, it ensures JSON-stringified fields are decoded.
+        - If dynamic fields are disabled, it filters out any fields not present in the schema.
+        """
+
+        dynamic_fields_enabled = self.has_dynamic_fields_enabled()
+
+        # If dynamic fields are enabled, 'languages' field needs to be a list
+        if dynamic_fields_enabled:
+            logger.debug("Dynamic fields enabled, ensuring 'languages' field is a list.")
+            prepared_data = []
+            for item in data:
+                new_item = item.copy()
+                if "languages" in new_item and isinstance(new_item["languages"], str):
+                    try:
+                        new_item["languages"] = json.loads(new_item["languages"])
+                    except (json.JSONDecodeError, TypeError):
+                        logger.warning(
+                            f"Could not JSON decode languages field: {new_item['languages']}. "
+                            "Leaving as string.",
+                        )
+                prepared_data.append(new_item)
+            return prepared_data
+
+        # If dynamic fields are not enabled, we need to filter out the metadata fields
+        # to avoid insertion errors for fields not defined in the schema
+        with self.get_client() as client:
+            collection_info = client.describe_collection(
+                self.upload_config.collection_name,
+            )
+            schema_fields = {
+                field["name"]
+                for field in collection_info.get("fields", [])
+                if not field.get("auto_id", False)
+            }
+        # Remove metadata fields that are not part of the base schema
+        filtered_data = []
+        for item in data:
+            filtered_item = {key: value for key, value in item.items() if key in schema_fields}
+            filtered_data.append(filtered_item)
+        return filtered_data
+
+    @requires_dependencies(["pymilvus"], extras="milvus")
+    def insert_results(self, data: list[dict]):
+        from pymilvus import MilvusException
+
+        logger.info(
+            f"uploading {len(data)} entries to {self.connection_config.db_name} "
+            f"db in collection {self.upload_config.collection_name}"
+        )
+
+        prepared_data = self._prepare_data_for_insert(data=data)
+
+        with self.get_client() as client:
+            try:
+                res = client.insert(
+                    collection_name=self.upload_config.collection_name, data=prepared_data
+                )
+            except MilvusException as milvus_exception:
+                raise WriteError(
+                    f"failed to upload records to Milvus: {str(milvus_exception.message)}"
+                ) from milvus_exception
+            if "err_count" in res and isinstance(res["err_count"], int) and res["err_count"] > 0:
+                err_count = res["err_count"]
+                raise WriteError(f"failed to upload {err_count} docs")
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        self.delete_by_record_id(file_data=file_data)
+        self.insert_results(data=data)
+
+
+milvus_destination_entry = DestinationRegistryEntry(
+    connection_config=MilvusConnectionConfig,
+    uploader=MilvusUploader,
+    uploader_config=MilvusUploaderConfig,
+    upload_stager=MilvusUploadStager,
+    upload_stager_config=MilvusUploadStagerConfig,
+)
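For orientation, here is a minimal usage sketch (not part of the package) showing how the classes added in milvus.py fit together as a destination. The URI, token, collection name, and sample element below are illustrative assumptions; in a real run the FileData object is supplied by an upstream source connector.

connection_config = MilvusConnectionConfig(
    access_config=MilvusAccessConfig(token="<milvus-token>"),  # assumed credentials
    uri="http://localhost:19530",  # assumed local Milvus instance
)
stager = MilvusUploadStager(
    upload_stager_config=MilvusUploadStagerConfig(flatten_metadata=True)
)
uploader = MilvusUploader(
    connection_config=connection_config,
    upload_config=MilvusUploaderConfig(collection_name="elements"),  # assumed collection
)

uploader.precheck()  # raises DestinationConnectionError if the collection is missing

# file_data is the FileData handed over by the pipeline for the current document
element = {"text": "hello world", "metadata": {"languages": ["eng"]}}
row = stager.conform_dict(element_dict=element, file_data=file_data)
uploader.run_data(data=[row], file_data=file_data)  # deletes prior rows for this record id, then inserts

In the actual pipeline this wiring is not done by hand; it is driven by the milvus_destination_entry registered with the connector registry at the bottom of the module.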
unstructured_ingest/processes/connectors/mongodb.py
@@ -0,0 +1,389 @@
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import datetime
+from time import time
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+from pydantic import BaseModel, Field, Secret
+
+from unstructured_ingest.__version__ import __version__ as unstructured_version
+from unstructured_ingest.data_types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.error import (
+    ConnectionError,
+    DestinationConnectionError,
+    SourceConnectionError,
+    ValueError,
+)
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    Indexer,
+    IndexerConfig,
+    Uploader,
+    UploaderConfig,
+    download_responses,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.utils.constants import RECORD_ID_LABEL
+from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from pymongo import MongoClient
+    from pymongo.collection import Collection
+
+CONNECTOR_TYPE = "mongodb"
+SERVER_API_VERSION = "1"
+
+
+class MongoDBAdditionalMetadata(BaseModel):
+    database: str
+    collection: str
+
+
+class MongoDBBatchFileData(BatchFileData):
+    additional_metadata: MongoDBAdditionalMetadata
+
+
+class MongoDBAccessConfig(AccessConfig):
+    uri: Optional[str] = Field(default=None, description="URI to user when connecting")
+
+
+class MongoDBConnectionConfig(ConnectionConfig):
+    access_config: Secret[MongoDBAccessConfig] = Field(
+        default=MongoDBAccessConfig(), validate_default=True
+    )
+    host: Optional[str] = Field(
+        default=None,
+        description="hostname or IP address or Unix domain socket path of a single mongod or "
+        "mongos instance to connect to, or a list of hostnames",
+    )
+    port: int = Field(default=27017)
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+
+    @contextmanager
+    @requires_dependencies(["pymongo"], extras="mongodb")
+    def get_client(self) -> Generator["MongoClient", None, None]:
+        from pymongo import MongoClient
+        from pymongo.driver_info import DriverInfo
+        from pymongo.server_api import ServerApi
+
+        access_config = self.access_config.get_secret_value()
+        if uri := access_config.uri:
+            client_kwargs = {
+                "host": uri,
+                "server_api": ServerApi(version=SERVER_API_VERSION),
+                "driver": DriverInfo(name="unstructured", version=unstructured_version),
+            }
+        else:
+            client_kwargs = {
+                "host": self.host,
+                "port": self.port,
+                "server_api": ServerApi(version=SERVER_API_VERSION),
+            }
+        with MongoClient(**client_kwargs) as client:
+            # UnsupportedDigestmodError means that SCRAM-SHA-1 is disabled
+            # It uses md5 which is unavailable on FIPS images
+            try:
+                from hashlib import UnsupportedDigestmodError  # type: ignore[attr-defined]
+            except ImportError:
+                from _hashlib import UnsupportedDigestmodError  # type: ignore[attr-defined]
+
+            # Check if the authentication mechanism is supported
+            try:
+                client.admin.command("ping")
+            except UnsupportedDigestmodError as e:
+                raise ConnectionError(
+                    "Authentication using SCRAM-SHA-1 is disabled. "
+                    "Use SCRAM-SHA-256 instead. "
+                    "See: https://www.mongodb.com/docs/manual/core/security-scram/"
+                ) from e
+            yield client
+
+
+class MongoDBIndexerConfig(IndexerConfig):
+    batch_size: int = Field(default=100, description="Number of records per batch")
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
+
+
+class MongoDBDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class MongoDBIndexer(Indexer):
+    connection_config: MongoDBConnectionConfig
+    index_config: MongoDBIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        """Validates the connection to the MongoDB server."""
+        try:
+            with self.connection_config.get_client() as client:
+                client.admin.command("ping")
+                database_names = client.list_database_names()
+                database_name = self.index_config.database
+                if database_name not in database_names:
+                    raise SourceConnectionError(
+                        "database {} does not exist: {}".format(
+                            database_name, ", ".join(database_names)
+                        )
+                    )
+                database = client[database_name]
+                collection_names = database.list_collection_names()
+                collection_name = self.index_config.collection
+                if collection_name not in collection_names:
+                    raise SourceConnectionError(
+                        "collection {} does not exist: {}".format(
+                            collection_name, ", ".join(collection_names)
+                        )
+                    )
+        except Exception as e:
+            logger.error(f"Failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[BatchFileData, None, None]:
+        """Generates FileData objects for each document in the MongoDB collection."""
+        with self.connection_config.get_client() as client:
+            database = client[self.index_config.database]
+            collection = database[self.index_config.collection]
+
+            # Get list of document IDs
+            ids = collection.distinct("_id")
+
+            ids = sorted(ids)
+            batch_size = self.index_config.batch_size
+
+            for id_batch in batch_generator(ids, batch_size=batch_size):
+                # Make sure the hash is always a positive number to create identifier
+                display_name = (
+                    f"{self.index_config.database}.{self.index_config.collection}, "
+                    f"batch {id_batch[0]}-{id_batch[-1]}"
+                )
+                metadata = FileDataSourceMetadata(
+                    date_processed=str(time()),
+                    record_locator={
+                        "database": self.index_config.database,
+                        "collection": self.index_config.collection,
+                    },
+                )
+
+                file_data = MongoDBBatchFileData(
+                    connector_type=self.connector_type,
+                    metadata=metadata,
+                    batch_items=[BatchItem(identifier=str(doc_id)) for doc_id in id_batch],
+                    additional_metadata=MongoDBAdditionalMetadata(
+                        collection=self.index_config.collection, database=self.index_config.database
+                    ),
+                    display_name=display_name,
+                )
+                yield file_data
+
+
+@dataclass
+class MongoDBDownloader(Downloader):
+    download_config: MongoDBDownloaderConfig
+    connection_config: MongoDBConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def generate_download_response(
+        self, doc: dict, file_data: MongoDBBatchFileData
+    ) -> DownloadResponse:
+        from bson.objectid import ObjectId
+
+        doc_id = doc["_id"]
+        doc.pop("_id", None)
+
+        # Extract date_created from the document or ObjectId
+        date_created = None
+        if "date_created" in doc:
+            # If the document has a 'date_created' field, use it
+            date_created = doc["date_created"]
+            if isinstance(date_created, datetime):
+                date_created = date_created.isoformat()
+            else:
+                # Convert to ISO format if it's a string
+                date_created = str(date_created)
+        elif isinstance(doc_id, ObjectId):
+            # Use the ObjectId's generation time
+            date_created = doc_id.generation_time.isoformat()
+
+        flattened_dict = flatten_dict(dictionary=doc)
+        concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
+
+        # Create a FileData object for each document with source_identifiers
+        filename = f"{doc_id}.txt"
+        file_data.source_identifiers = SourceIdentifiers(
+            filename=filename,
+            fullpath=filename,
+        )
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = str(doc_id)
+
+        # Determine the download path
+        download_path = self.get_download_path(file_data=cast_file_data)
+        if download_path is None:
+            raise ValueError("Download path could not be determined")
+
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Write the concatenated values to the file
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(concatenated_values)
+
+        # Update metadata
+        cast_file_data.metadata.record_locator["document_id"] = str(doc_id)
+        cast_file_data.metadata.date_created = date_created
+
+        return super().generate_download_response(
+            file_data=cast_file_data, download_path=download_path
+        )
+
+    @SourceConnectionError.wrap
+    @requires_dependencies(["bson"], extras="mongodb")
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        """Fetches the document from MongoDB and writes it to a file."""
+        from bson.errors import InvalidId
+        from bson.objectid import ObjectId
+
+        mongo_file_data = MongoDBBatchFileData.cast(file_data=file_data)
+
+        with self.connection_config.get_client() as client:
+            database = client[mongo_file_data.additional_metadata.database]
+            collection = database[mongo_file_data.additional_metadata.collection]
+
+            ids = [item.identifier for item in mongo_file_data.batch_items]
+
+            object_ids = []
+            for doc_id in ids:
+                try:
+                    object_ids.append(ObjectId(doc_id))
+                except InvalidId as e:
+                    error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
+                    logger.error(error_message)
+                    raise ValueError(error_message) from e
+
+            try:
+                docs = list(collection.find({"_id": {"$in": object_ids}}))
+            except Exception as e:
+                logger.error(f"Failed to fetch documents: {e}", exc_info=True)
+                raise e
+
+        download_responses = []
+        for doc in docs:
+            download_responses.append(
+                self.generate_download_response(doc=doc, file_data=mongo_file_data)
+            )
+
+        return download_responses
+
+
+class MongoDBUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=100, description="Number of records per batch")
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
+
+
+@dataclass
+class MongoDBUploader(Uploader):
+    upload_config: MongoDBUploaderConfig
+    connection_config: MongoDBConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            with self.connection_config.get_client() as client:
+                client.admin.command("ping")
+                database_names = client.list_database_names()
+                database_name = self.upload_config.database
+                if database_name not in database_names:
+                    raise DestinationConnectionError(
+                        "database {} does not exist: {}".format(
+                            database_name, ", ".join(database_names)
+                        )
+                    )
+                database = client[database_name]
+                collection_names = database.list_collection_names()
+                collection_name = self.upload_config.collection
+                if collection_name not in collection_names:
+                    raise DestinationConnectionError(
+                        "collection {} does not exist: {}".format(
+                            collection_name, ", ".join(collection_names)
+                        )
+                    )
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def can_delete(self, collection: "Collection") -> bool:
+        indexed_keys = []
+        for index in collection.list_indexes():
+            key_bson = index["key"]
+            indexed_keys.extend(key_bson.keys())
+        return self.upload_config.record_id_key in indexed_keys
+
+    def delete_by_record_id(self, collection: "Collection", file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from collection: {collection.name}"
+        )
+        query = {self.upload_config.record_id_key: file_data.identifier}
+        delete_results = collection.delete_many(filter=query)
+        logger.info(
+            f"deleted {delete_results.deleted_count} records from collection {collection.name}"
+        )
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        logger.info(
+            f"writing {len(data)} objects to destination "
+            f"db, {self.upload_config.database}, "
+            f"collection {self.upload_config.collection} "
+            f"at {self.connection_config.host}",
+        )
+        # This would typically live in the stager but since no other manipulation
+        # is done, setting the record id field in the uploader
+        for element in data:
+            element[self.upload_config.record_id_key] = file_data.identifier
+        with self.connection_config.get_client() as client:
+            db = client[self.upload_config.database]
+            collection = db[self.upload_config.collection]
+            if self.can_delete(collection=collection):
+                self.delete_by_record_id(file_data=file_data, collection=collection)
+            else:
+                logger.warning("criteria for deleting previous content not met, skipping")
+            for chunk in batch_generator(data, self.upload_config.batch_size):
+                collection.insert_many(chunk)
+
+
+mongodb_destination_entry = DestinationRegistryEntry(
+    connection_config=MongoDBConnectionConfig,
+    uploader=MongoDBUploader,
+    uploader_config=MongoDBUploaderConfig,
+)
+
+mongodb_source_entry = SourceRegistryEntry(
+    connection_config=MongoDBConnectionConfig,
+    indexer_config=MongoDBIndexerConfig,
+    indexer=MongoDBIndexer,
+    downloader_config=MongoDBDownloaderConfig,
+    downloader=MongoDBDownloader,
+)
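And a matching minimal sketch (not part of the package) of the MongoDB source side added above; the connection URI, database name, and collection name are assumptions.

connection_config = MongoDBConnectionConfig(
    access_config=MongoDBAccessConfig(uri="mongodb://localhost:27017"),  # assumed URI
)
indexer = MongoDBIndexer(
    connection_config=connection_config,
    index_config=MongoDBIndexerConfig(database="ingest", collection="docs", batch_size=100),
)
downloader = MongoDBDownloader(
    connection_config=connection_config,
    download_config=MongoDBDownloaderConfig(),
)

indexer.precheck()  # raises SourceConnectionError if the database or collection is missing
for batch_file_data in indexer.run():  # one MongoDBBatchFileData per batch of _id values
    # fetches each batch and writes each document's flattened values to a .txt file
    responses = downloader.run(file_data=batch_file_data)

As with Milvus, production use goes through the registry entries (mongodb_source_entry and mongodb_destination_entry) rather than constructing these objects directly.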