unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Salesforce Connector
|
|
3
|
+
Able to download Account, Case, Campaign, EmailMessage, Lead
|
|
4
|
+
Salesforce returns everything as a list of json.
|
|
5
|
+
This saves each entry as a separate file to be partitioned.
|
|
6
|
+
Using JWT authorization
|
|
7
|
+
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm
|
|
8
|
+
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from collections import OrderedDict
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from email.utils import formatdate
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from string import Template
|
|
17
|
+
from textwrap import dedent
|
|
18
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, Type
|
|
19
|
+
|
|
20
|
+
from dateutil import parser
|
|
21
|
+
from pydantic import Field, Secret
|
|
22
|
+
|
|
23
|
+
from unstructured_ingest.data_types.file_data import (
|
|
24
|
+
FileData,
|
|
25
|
+
FileDataSourceMetadata,
|
|
26
|
+
SourceIdentifiers,
|
|
27
|
+
)
|
|
28
|
+
from unstructured_ingest.error import (
|
|
29
|
+
MissingCategoryError,
|
|
30
|
+
SourceConnectionError,
|
|
31
|
+
SourceConnectionNetworkError,
|
|
32
|
+
ValueError,
|
|
33
|
+
)
|
|
34
|
+
from unstructured_ingest.interfaces import (
|
|
35
|
+
AccessConfig,
|
|
36
|
+
ConnectionConfig,
|
|
37
|
+
Downloader,
|
|
38
|
+
DownloaderConfig,
|
|
39
|
+
DownloadResponse,
|
|
40
|
+
Indexer,
|
|
41
|
+
IndexerConfig,
|
|
42
|
+
)
|
|
43
|
+
from unstructured_ingest.logger import logger
|
|
44
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
45
|
+
SourceRegistryEntry,
|
|
46
|
+
)
|
|
47
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
48
|
+
|
|
49
|
+
CONNECTOR_TYPE = "salesforce"
|
|
50
|
+
|
|
51
|
+
if TYPE_CHECKING:
|
|
52
|
+
from simple_salesforce import Salesforce
|
|
53
|
+
|
|
54
|
+
SALESFORCE_API_VERSION = "57.0"
|
|
55
|
+
|
|
56
|
+
# TODO: Add more categories as needed
|
|
57
|
+
ACCEPTED_CATEGORIES: list[str] = ["Account", "Case", "Campaign", "EmailMessage", "Lead"]
|
|
58
|
+
|
|
59
|
+
# Generic minimal email template used only
|
|
60
|
+
# to process EmailMessage records as .eml files
|
|
61
|
+
EMAIL_TEMPLATE = Template(
|
|
62
|
+
"""MIME-Version: 1.0
|
|
63
|
+
Date: $date
|
|
64
|
+
Message-ID: $message_identifier
|
|
65
|
+
Subject: $subject
|
|
66
|
+
From: $from_email
|
|
67
|
+
To: $to_email
|
|
68
|
+
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
|
|
69
|
+
--00000000000095c9b205eff92630
|
|
70
|
+
Content-Type: text/plain; charset="UTF-8"
|
|
71
|
+
$textbody
|
|
72
|
+
--00000000000095c9b205eff92630
|
|
73
|
+
Content-Type: text/html; charset="UTF-8"
|
|
74
|
+
$htmlbody
|
|
75
|
+
--00000000000095c9b205eff92630--
|
|
76
|
+
""",
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class SalesforceAccessConfig(AccessConfig):
    # Credentials for the Salesforce JWT flow: the connected app's consumer key
    # plus exactly one source for the private key (a file path or the inline
    # key contents). The "exactly one" rule is enforced in model_post_init.
    consumer_key: str
    private_key_path: Optional[Path] = Field(
        default=None,
        description="Path to the private key file. Key file is usually named server.key.",
    )
    private_key: Optional[str] = Field(default=None, description="Contents of the private key")

    def model_post_init(self, __context: Any) -> None:
        """Require exactly one of ``private_key`` and ``private_key_path``.

        Raises:
            ValueError: if neither field is set, or if both are set.
        """
        if self.private_key_path is None and self.private_key is None:
            raise ValueError("either private_key or private_key_path must be set")
        if self.private_key is not None and self.private_key_path is not None:
            raise ValueError("only one of private_key or private_key_path must be set")

    @requires_dependencies(["cryptography"])
    def get_private_key_value_and_type(self) -> tuple[str, Type]:
        """Return the configured private key together with a tag for its form.

        Returns:
            ``(str(private_key_path), Path)`` when the path points to an
            existing file, otherwise ``(private_key, str)`` once the inline
            key has been parsed successfully as an unencrypted PEM key.

        Raises:
            ValueError: if the inline key cannot be parsed, if
                ``private_key_path`` is set but is not an existing file, or if
                no usable key is configured.
        """
        from cryptography.hazmat.primitives import serialization

        key_path = self.private_key_path
        if key_path and key_path.is_file():
            return str(key_path), Path

        if self.private_key:
            try:
                # Parsed only for validation; the result is discarded.
                serialization.load_pem_private_key(
                    data=str(self.private_key).encode("utf-8"), password=None
                )
            except Exception as e:
                raise ValueError(f"failed to validate private key data: {e}") from e
            return self.private_key, str

        if key_path is not None:
            # A path was supplied but it is not a file: report that directly
            # instead of blaming the (unset) inline private_key.
            raise ValueError(f"private_key_path does not point to an existing file: {key_path}")

        raise ValueError("private_key does not contain PEM private key or path")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class SalesforceConnectionConfig(ConnectionConfig):
    # Salesforce username plus the secret-wrapped access configuration.
    username: str
    access_config: Secret[SalesforceAccessConfig]

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def get_client(self) -> "Salesforce":
        """Create a ``simple_salesforce.Salesforce`` client from this config."""
        from simple_salesforce import Salesforce

        secrets = self.access_config.get_secret_value()
        key_value, key_kind = secrets.get_private_key_value_and_type()

        # The type tag returned with the key selects which keyword receives
        # it; the other keyword is passed as None.
        key_file = key_value if key_kind is Path else None
        key_text = key_value if key_kind is str else None

        return Salesforce(
            username=self.username,
            consumer_key=secrets.consumer_key,
            privatekey_file=key_file,
            privatekey=key_text,
            version=SALESFORCE_API_VERSION,
        )
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class SalesforceIndexerConfig(IndexerConfig):
    # Salesforce record types to index. SalesforceIndexer.__post_init__ rejects
    # any entry that is not listed in ACCEPTED_CATEGORIES.
    categories: list[str]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@dataclass
class SalesforceIndexer(Indexer):
    # Queries the ids of every record in the configured categories and emits
    # one FileData per record for the downloader to fetch.
    connection_config: SalesforceConnectionConfig
    index_config: SalesforceIndexerConfig

    def __post_init__(self):
        """Reject configured categories that are not in ``ACCEPTED_CATEGORIES``.

        Raises:
            ValueError: on the first unsupported category encountered.
        """
        for record_type in self.index_config.categories:
            if record_type not in ACCEPTED_CATEGORIES:
                raise ValueError(f"{record_type} not currently an accepted Salesforce category")

    def precheck(self) -> None:
        """Check that a Salesforce client can be created.

        Raises:
            SourceConnectionError: if creating the client fails for any reason;
                the original exception is attached as the cause.
        """
        try:
            self.connection_config.get_client()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_file_extension(self, record_type) -> str:
        """Return the file extension used for documents of ``record_type``.

        ``EmailMessage`` records map to ``.eml``; ``Account``, ``Lead``,
        ``Case`` and ``Campaign`` map to ``.xml``.

        Raises:
            MissingCategoryError: for any other record type.
        """
        if record_type == "EmailMessage":
            extension = ".eml"
        elif record_type in ["Account", "Lead", "Case", "Campaign"]:
            extension = ".xml"
        else:
            raise MissingCategoryError(
                f"There are no categories with the name: {record_type}",
            )
        return extension

    def _file_data_for_record(self, record: dict[str, Any]) -> FileData:
        """Translate one row of the id query into a ``FileData`` entry."""
        record_id = record["Id"]
        record_type = record["attributes"]["type"]
        record_with_extension = record_id + self.get_file_extension(record_type)
        source_identifiers = SourceIdentifiers(
            filename=record_with_extension,
            fullpath=f"{record_type}/{record_with_extension}",
        )
        return FileData(
            connector_type=CONNECTOR_TYPE,
            identifier=record_id,
            source_identifiers=source_identifiers,
            metadata=FileDataSourceMetadata(
                url=record["attributes"]["url"],
                # Timestamps are stored as stringified epoch seconds.
                version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
                date_modified=str(parser.parse(record["LastModifiedDate"]).timestamp()),
                record_locator={"id": record_id},
            ),
            additional_metadata={"record_type": record_type},
            display_name=source_identifiers.fullpath,
        )

    @requires_dependencies(["simple_salesforce"], extras="salesforce")
    def list_files(self) -> list[FileData]:
        """Get Salesforce Ids for the records.

        Send them to next phase where each doc gets downloaded into the
        appropriate format for partitioning.

        Raises:
            SalesforceMalformedRequest: re-raised unchanged (after logging)
                when Salesforce rejects the id query.
        """
        from simple_salesforce.exceptions import SalesforceMalformedRequest

        client = self.connection_config.get_client()

        files_list: list[FileData] = []
        for record_type in self.index_config.categories:
            try:
                # Get ids from Salesforce
                records = client.query_all_iter(
                    f"select Id, SystemModstamp, CreatedDate, LastModifiedDate from {record_type}",
                )
                for record in records:
                    files_list.append(self._file_data_for_record(record))
            except SalesforceMalformedRequest as e:
                # SalesforceMalformedRequest cannot be rebuilt from a single
                # message string (its constructor takes url, status,
                # resource_name and content), so log the context and re-raise
                # the original exception instead of constructing a new one.
                logger.error(f"Problem with Salesforce query: {e}")
                raise

        return files_list

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield every ``FileData`` produced by ``list_files``."""
        yield from self.list_files()
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class SalesforceDownloaderConfig(DownloaderConfig):
    # Declares no Salesforce-specific download options of its own.
    ...
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
@dataclass
class SalesforceDownloader(Downloader):
    # Fetches a single Salesforce record by id and writes it to disk as an
    # .eml document (EmailMessage) or a flat XML document (everything else).
    connection_config: SalesforceConnectionConfig
    download_config: SalesforceDownloaderConfig = field(
        default_factory=lambda: SalesforceDownloaderConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def _xml_for_record(self, record: OrderedDict) -> str:
        """Creates partitionable xml file from a record.

        Every non-``OrderedDict`` value becomes an ``<item>`` element whose
        text is ``"<dotted key path>: <value>"``; nested ``OrderedDict``
        values are flattened into the same ``<root>`` with a ``parent.``
        style key prefix.
        """
        import xml.etree.ElementTree as ET

        def create_xml_doc(data, parent, prefix=""):
            for key, value in data.items():
                if isinstance(value, OrderedDict):
                    create_xml_doc(value, parent, prefix=f"{prefix}{key}.")
                else:
                    item = ET.Element("item")
                    item.text = f"{prefix}{key}: {value}"
                    parent.append(item)

        root = ET.Element("root")
        create_xml_doc(record, root)

        xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode("utf-8")
        return xml_string

    def _eml_for_record(self, email_json: dict[str, Any]) -> str:
        """Recreates standard expected .eml format using template."""
        eml = EMAIL_TEMPLATE.substitute(
            date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()),
            message_identifier=email_json.get("MessageIdentifier"),
            subject=email_json.get("Subject"),
            from_email=email_json.get("FromAddress"),
            to_email=email_json.get("ToAddress"),
            textbody=email_json.get("TextBody"),
            htmlbody=email_json.get("HtmlBody"),
        )
        return dedent(eml)

    @SourceConnectionNetworkError.wrap
    def _get_response(self, file_data: FileData) -> OrderedDict:
        """Query Salesforce for the standard fields of the record in ``file_data``."""
        client = self.connection_config.get_client()
        return client.query(
            f"select FIELDS(STANDARD) from {file_data.additional_metadata['record_type']} where Id='{file_data.identifier}'",  # noqa: E501
        )

    def get_record(self, file_data: FileData) -> OrderedDict:
        """Return the first record matching ``file_data.identifier``.

        Raises:
            ValueError: if the query response contains no records.
        """
        # Get record from Salesforce based on id
        response = self._get_response(file_data)
        logger.debug(f"response was returned for salesforce record id: {file_data.identifier}")
        records = response["records"]
        if not records:
            raise ValueError(
                f"No record found with record id {file_data.identifier}: {json.dumps(response)}"
            )
        record_json = records[0]
        return record_json

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download one record and write it to its download path.

        Raises:
            SourceConnectionNetworkError: if rendering or writing the document
                fails; the original exception is attached as the cause.
        """
        record = self.get_record(file_data)

        try:
            if file_data.additional_metadata["record_type"] == "EmailMessage":
                document = self._eml_for_record(record)
            else:
                document = self._xml_for_record(record)
            download_path = self.get_download_path(file_data=file_data)
            download_path.parent.mkdir(parents=True, exist_ok=True)

            # Both document formats declare UTF-8 in their content, so write
            # UTF-8 explicitly rather than relying on the platform default.
            with open(download_path, "w", encoding="utf-8") as page_file:
                page_file.write(document)

        except Exception as e:
            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e

        return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# Source registry entry bundling the Salesforce connection config, indexer and
# downloader classes.
salesforce_source_entry = SourceRegistryEntry(
    indexer=SalesforceIndexer,
    indexer_config=SalesforceIndexerConfig,
    downloader=SalesforceDownloader,
    downloader_config=SalesforceDownloaderConfig,
    connection_config=SalesforceConnectionConfig,
)
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import Field
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.data_types.file_data import (
|
|
11
|
+
FileData,
|
|
12
|
+
)
|
|
13
|
+
from unstructured_ingest.error import (
|
|
14
|
+
NotFoundError,
|
|
15
|
+
SourceConnectionError,
|
|
16
|
+
SourceConnectionNetworkError,
|
|
17
|
+
UserAuthError,
|
|
18
|
+
UserError,
|
|
19
|
+
ValueError,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.logger import logger
|
|
22
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
23
|
+
SourceRegistryEntry,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.processes.connectors.onedrive import (
|
|
26
|
+
OnedriveAccessConfig,
|
|
27
|
+
OnedriveConnectionConfig,
|
|
28
|
+
OnedriveDownloader,
|
|
29
|
+
OnedriveDownloaderConfig,
|
|
30
|
+
OnedriveIndexer,
|
|
31
|
+
OnedriveIndexerConfig,
|
|
32
|
+
)
|
|
33
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
37
|
+
from office365.onedrive.sites.site import Site
|
|
38
|
+
from office365.runtime.client_request_exception import ClientRequestException
|
|
39
|
+
|
|
40
|
+
CONNECTOR_TYPE = "sharepoint"
|
|
41
|
+
LEGACY_DEFAULT_PATH = "Shared Documents"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class SharepointAccessConfig(OnedriveAccessConfig):
    # Declares the Microsoft app client secret on top of OnedriveAccessConfig.
    client_cred: str = Field(description="Microsoft App client secret")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class SharepointConnectionConfig(OnedriveConnectionConfig):
    # Connection settings for a SharePoint site, layered on the OneDrive connection config.
    #
    # The field descriptions are built with implicit string concatenation instead of
    # backslash continuations inside the literal: a continuation inside a string embeds
    # the source indentation of the following line in the help text and gives no
    # separator between sentences that end exactly at the line break.
    user_pname: Optional[str] = Field(
        default=None,
        description="User principal name or service account, usually your Azure AD email.",
    )
    site: str = Field(
        description=(
            "Sharepoint site url. Process either base url e.g "
            "https://[tenant].sharepoint.com or relative sites "
            "https://[tenant].sharepoint.com/sites/<site_name>. "
            "To process all sites within the tenant pass a site url as "
            "https://[tenant]-admin.sharepoint.com. "
            "This requires the app to be registered at a tenant level"
        )
    )
    library: Optional[str] = Field(
        default=None,
        description=(
            "Sharepoint library name. If not provided, the default drive will be used."
        ),
    )

    def _get_drive_item(self, client_site: Site) -> DriveItem:
        """Return the root drive item of the requested library, or of the default drive.

        Args:
            client_site: Site object whose drives are searched.

        Returns:
            The ``root`` of the first drive whose name equals ``self.library``. When no
            library is configured, or none of the site's drives has that name, the
            ``root`` of the site's default drive is returned instead.
        """
        if self.library:
            for drive in client_site.drives.get().execute_query():
                if drive.name == self.library:
                    logger.info(f"Found the requested library: {self.library}")
                    return drive.get().execute_query().root

            # A library was requested but no drive carries that name: fall back to the
            # default drive rather than failing, and leave a trace in the log.
            logger.warning(
                f"Library '{self.library}' not found in site '{self.site}'. "
                "Using the default drive instead."
            )

        return client_site.drive.get().execute_query().root
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class SharepointIndexerConfig(OnedriveIndexerConfig):
    # TODO: We can probably make path non-optional on OnedriveIndexerConfig once tested
    # Defaults to the empty string, which SharepointIndexer._is_root_path treats as root.
    path: str = Field(
        default="",
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass
class SharepointIndexer(OnedriveIndexer):
    """Indexer that lists files from a SharePoint site drive.

    The site is resolved from ``connection_config.site``, the drive root is chosen by
    ``SharepointConnectionConfig._get_drive_item`` and the listing is optionally
    narrowed to ``index_config.path``.
    """

    connection_config: SharepointConnectionConfig
    index_config: SharepointIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def _handle_client_request_exception(self, e: ClientRequestException, context: str) -> None:
        """Convert ClientRequestException to appropriate user-facing error based on HTTP status.

        This method never returns normally; it always raises.

        Args:
            e: The exception raised by the office365 client.
            context: Human-readable description of what was being accessed; it is
                interpolated into the error message.

        Raises:
            UserAuthError: The response status code is 401 or 403.
            UserError: The status code is 404, any other value, or not available.
        """
        # Missing attributes at either level simply leave status_code as None, which
        # falls through to the generic error at the bottom.
        response = getattr(e, "response", None)
        status_code = getattr(response, "status_code", None)

        if status_code == 401:
            raise UserAuthError(
                f"Unauthorized access to {context}. Check client credentials and permissions"
            ) from e
        if status_code == 403:
            raise UserAuthError(
                f"Access forbidden to {context}. "
                f"Check app permissions (Sites.Read.All required)"
            ) from e
        if status_code == 404:
            raise UserError(f"Not found: {context}") from e

        raise UserError(f"Failed to access {context}: {str(e)}") from e

    def _is_root_path(self, path: str) -> bool:
        """Check if the path represents root access (empty string or legacy default)."""
        return not path or not path.strip() or path == LEGACY_DEFAULT_PATH

    def _get_target_drive_item(self, site_drive_item: DriveItem, path: str) -> DriveItem:
        """Get the drive item to search in based on the path.

        Returns ``site_drive_item`` itself for a root path, otherwise the item found by
        looking up ``path`` beneath it (one blocking request).
        """
        if self._is_root_path(path):
            return site_drive_item
        return site_drive_item.get_by_path(path).get().execute_query()

    def _validate_folder_path(self, site_drive_item: DriveItem, path: str) -> None:
        """Validate that a specific folder path exists and is accessible.

        Args:
            site_drive_item: Drive root under which ``path`` is looked up.
            path: Path to validate.

        Raises:
            UserAuthError: The lookup was rejected with HTTP 401 or 403.
            UserError: The path could not be found or the lookup failed for any
                other reason.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        try:
            path_item = site_drive_item.get_by_path(path).get().execute_query()
            # NOTE(review): this only checks that the attribute exists, not its value,
            # so it does not by itself prove the item is a folder.
            if path_item is None or not hasattr(path_item, "is_folder"):
                raise UserError(
                    f"SharePoint path '{path}' not found in site {self.connection_config.site}. "
                    f"Check that the path exists and you have access to it"
                )
            logger.info(f"SharePoint folder path '{path}' validated successfully")
        except (UserError, UserAuthError):
            # Already a user-facing error (the "not found" case above): let it through
            # untouched instead of logging it as unexpected and wrapping it again.
            raise
        except ClientRequestException as e:
            logger.error(f"Failed to access SharePoint path '{path}': {e}")
            self._handle_client_request_exception(e, f"SharePoint path '{path}'")
        except Exception as e:
            logger.error(f"Unexpected error accessing SharePoint path '{path}': {e}")
            raise UserError(f"Failed to validate SharePoint path '{path}': {str(e)}") from e

    @requires_dependencies(["office365"], extras="sharepoint")
    def precheck(self) -> None:
        """Validate SharePoint connection before indexing.

        Acquires a token, resolves the site and its drive and, when a non-root path is
        configured, validates that path.

        Raises:
            UserAuthError: The site or path request was rejected with HTTP 401 or 403.
            UserError: Any other failure while resolving the site, drive or path.

        Errors raised by ``connection_config.get_token()`` propagate unchanged because
        that call sits outside the ``try`` block.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        # Validate authentication - this call will raise UserAuthError if invalid
        self.connection_config.get_token()

        try:
            client = self.connection_config.get_client()
            client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
            site_drive_item = self.connection_config._get_drive_item(client_site)

            path = self.index_config.path
            if not self._is_root_path(path):
                self._validate_folder_path(site_drive_item, path)

            logger.info(
                f"SharePoint connection validated successfully for site: "
                f"{self.connection_config.site}"
            )

        except (UserError, UserAuthError):
            # Path validation already produced a precise user-facing error. Without this
            # clause the generic handler below would turn a UserAuthError into a plain
            # UserError and log it as unexpected.
            raise
        except ClientRequestException as e:
            logger.error(f"SharePoint precheck failed for site: {self.connection_config.site}")
            self._handle_client_request_exception(
                e, f"SharePoint site {self.connection_config.site}"
            )
        except Exception as e:
            logger.error(f"Unexpected error during SharePoint precheck: {e}", exc_info=True)
            raise UserError(f"Failed to validate SharePoint connection: {str(e)}") from e

    @requires_dependencies(["office365"], extras="sharepoint")
    async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
        """Yield a ``FileData`` for every file found under the configured path.

        Raises:
            SourceConnectionError: The token response contains an ``error`` key, or the
                site / drive lookup raised ``ClientRequestException``.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        token_resp = await asyncio.to_thread(self.connection_config.get_token)
        if "error" in token_resp:
            raise SourceConnectionError(
                f"[{self.connector_type}]: {token_resp['error']} "
                f"({token_resp.get('error_description')})"
            )

        client = await asyncio.to_thread(self.connection_config.get_client)

        def _lookup_site_drive_item() -> DriveItem:
            # Blocking requests; run in a worker thread like the other client calls in
            # this coroutine so the event loop is not stalled.
            client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
            return self.connection_config._get_drive_item(client_site)

        try:
            site_drive_item = await asyncio.to_thread(_lookup_site_drive_item)
        except ClientRequestException as e:
            logger.error(f"Failed to access SharePoint site: {self.connection_config.site}")
            raise SourceConnectionError(
                f"Unable to access SharePoint site at {self.connection_config.site}: {str(e)}"
            ) from e

        path = self.index_config.path
        target_drive_item = await asyncio.to_thread(
            self._get_target_drive_item, site_drive_item, path
        )

        # NOTE(review): the listing query itself still executes synchronously here.
        for drive_item in target_drive_item.get_files(
            recursive=self.index_config.recursive
        ).execute_query():
            file_data = await self.drive_item_to_file_data(drive_item=drive_item)
            yield file_data
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class SharepointDownloaderConfig(OnedriveDownloaderConfig):
    # Maximum number of attempts for the retry-wrapped lookup in
    # SharepointDownloader._fetch_file (passed to tenacity's stop_after_attempt).
    max_retries: int = 10
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@dataclass
class SharepointDownloader(OnedriveDownloader):
    """Downloader that resolves SharePoint drive items, retrying throttled requests."""

    connection_config: SharepointConnectionConfig
    download_config: SharepointDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    @staticmethod
    def retry_on_status_code(exc):
        """Return True when the exception text looks like a throttling response.

        The check is a case-insensitive substring match on ``str(exc)`` for ``429``,
        ``activitylimitreached`` or ``throttled``.
        """
        error_msg = str(exc).lower()
        return "429" in error_msg or "activitylimitreached" in error_msg or "throttled" in error_msg

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["office365"], extras="sharepoint")
    def _fetch_file(self, file_data: FileData) -> DriveItem:
        """Resolve the drive item for ``file_data`` by its full path.

        The site lookup and the item lookup run inside a tenacity-wrapped function that
        retries (with exponential backoff, up to ``download_config.max_retries``
        attempts) whenever ``retry_on_status_code`` matches the raised exception.

        Raises:
            ValueError: ``file_data`` has no source identifiers or no full path.
            SourceConnectionError: The site / drive lookup failed for a reason other
                than throttling.
            NotFoundError: The lookup returned a falsy item.
        """
        from office365.runtime.client_request_exception import ClientRequestException
        from tenacity import (
            before_log,
            retry,
            retry_if_exception,
            stop_after_attempt,
            wait_exponential,
        )

        if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
            raise ValueError(
                f"file data doesn't have enough information to get "
                f"file content: {file_data.model_dump()}"
            )

        server_relative_path = file_data.source_identifiers.fullpath
        client = self.connection_config.get_client()

        @retry(
            stop=stop_after_attempt(self.download_config.max_retries),
            wait=wait_exponential(exp_base=2, multiplier=1, min=2, max=10),
            retry=retry_if_exception(self.retry_on_status_code),
            before=before_log(logger, logging.DEBUG),
            reraise=True,
        )
        def _get_item_by_path() -> DriveItem:
            try:
                client_site = (
                    client.sites.get_by_url(self.connection_config.site).get().execute_query()
                )
                site_drive_item = self.connection_config._get_drive_item(client_site)
            except ClientRequestException as e:
                if self.retry_on_status_code(e):
                    # Throttled request: re-raise unchanged so the retry predicate can
                    # match it. Converting it to "Site not found" would hide the
                    # throttling markers and make the failure non-retryable.
                    raise
                logger.info(f"Site not found: {self.connection_config.site}")
                raise SourceConnectionError(
                    f"Site not found: {self.connection_config.site}"
                ) from e
            return site_drive_item.get_by_path(server_relative_path).get().execute_query()

        # Call the retry-wrapped function
        file = _get_item_by_path()

        if not file:
            raise NotFoundError(f"file not found: {server_relative_path}")
        return file
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
# Source registry entry bundling the SharePoint connection config with its
# indexer and downloader classes and their respective config models.
sharepoint_source_entry = SourceRegistryEntry(
    connection_config=SharepointConnectionConfig,
    indexer=SharepointIndexer,
    indexer_config=SharepointIndexerConfig,
    downloader=SharepointDownloader,
    downloader_config=SharepointDownloaderConfig,
)
|