unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.interfaces.connector import AccessConfig
|
|
6
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
7
|
+
from unstructured_ingest.processes.connectors.lancedb.lancedb import (
|
|
8
|
+
LanceDBRemoteConnectionConfig,
|
|
9
|
+
LanceDBUploader,
|
|
10
|
+
LanceDBUploaderConfig,
|
|
11
|
+
LanceDBUploadStager,
|
|
12
|
+
LanceDBUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Connector type identifier; used below as the default `connector_type` of the uploader.
CONNECTOR_TYPE = "lancedb_gcs"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Access settings for the LanceDB connector defined in this module.
class LanceDBGCSAccessConfig(AccessConfig):
    # Required string field: the service account key in serialized form.
    google_service_account_key: str = Field(
        description="The serialized google service account key."
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Connection settings pairing the remote LanceDB options with the access config above.
class LanceDBGCSConnectionConfig(LanceDBRemoteConnectionConfig):
    # Wrapped in Secret; it has to be unwrapped with get_secret_value() before use.
    access_config: Secret[LanceDBGCSAccessConfig]

    def get_storage_options(self) -> dict:
        """Return the dumped access-config fields plus the configured ``timeout``."""
        storage_options = self.access_config.get_secret_value().model_dump()
        # Set last so the connection-level timeout wins over any same-named key.
        storage_options["timeout"] = self.timeout
        return storage_options
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# LanceDB uploader specialised to the connection config defined in this module.
@dataclass
class LanceDBGSPUploader(LanceDBUploader):
    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBGCSConnectionConfig
    # Defaults to this module's connector type identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Destination registry entry bundling the connection config, uploader and stager classes
# (with their config classes) for this connector.
lancedb_gcp_destination_entry = DestinationRegistryEntry(
    connection_config=LanceDBGCSConnectionConfig,
    uploader=LanceDBGSPUploader,
    uploader_config=LanceDBUploaderConfig,
    upload_stager_config=LanceDBUploadStagerConfig,
    upload_stager=LanceDBUploadStager,
)
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from contextlib import asynccontextmanager
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
|
|
10
|
+
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
|
|
13
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
14
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
15
|
+
from unstructured_ingest.interfaces import (
|
|
16
|
+
ConnectionConfig,
|
|
17
|
+
Uploader,
|
|
18
|
+
UploaderConfig,
|
|
19
|
+
UploadStager,
|
|
20
|
+
UploadStagerConfig,
|
|
21
|
+
)
|
|
22
|
+
from unstructured_ingest.logger import logger
|
|
23
|
+
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
|
|
24
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
25
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
26
|
+
|
|
27
|
+
# Connector type identifier; used below as the default `connector_type` of LanceDBUploader.
CONNECTOR_TYPE = "lancedb"
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from lancedb import AsyncConnection
|
|
31
|
+
from lancedb.table import AsyncTable
|
|
32
|
+
from pandas import DataFrame
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Abstract base for LanceDB connection settings; subclasses provide the storage options.
class LanceDBConnectionConfig(ConnectionConfig, ABC):
    uri: str = Field(description="The uri of the database.")

    @abstractmethod
    def get_storage_options(self) -> Optional[dict[str, str]]:
        """Return the storage options handed to ``lancedb.connect_async`` (may be ``None``)."""
        raise NotImplementedError

    @asynccontextmanager
    @requires_dependencies(["lancedb"], extras="lancedb")
    @DestinationConnectionError.wrap
    async def get_async_connection(self) -> AsyncGenerator["AsyncConnection", None]:
        """Async context manager yielding a LanceDB connection opened on ``self.uri``."""
        # Imported inside the method so the module loads without the optional dependency.
        import lancedb

        # The awaited connection is entered with a plain `with`; the connection is
        # yielded to the caller while that block is still open.
        with await lancedb.connect_async(
            self.uri,
            storage_options=self.get_storage_options(),
        ) as connection:
            yield connection
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Connection settings shared by the remote LanceDB connectors: adds a request timeout.
class LanceDBRemoteConnectionConfig(LanceDBConnectionConfig):
    # Duration string such as "30s": an integer followed by a unit suffix.
    # NOTE(review): the pattern has no ^/$ anchors, so it may accept values that merely
    # contain a matching substring - confirm the intended strictness before tightening.
    timeout: str = Field(
        default="30s",
        description=(
            # Trailing space added: the two fragments previously ran together
            # as "finishedin a".
            "Timeout for the entire request, from connection until the response body "
            "has finished in a [0-9]+(ns|us|ms|[smhdwy]) format."
        ),
        pattern=r"[0-9]+(ns|us|ms|[smhdwy])",
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Stager configuration for LanceDB; declares no options beyond the base class.
class LanceDBUploadStagerConfig(UploadStagerConfig):
    pass
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Upload stager that turns a JSON elements file into a Feather file of flattened rows.
@dataclass
class LanceDBUploadStager(UploadStager):
    upload_stager_config: LanceDBUploadStagerConfig = field(
        default_factory=LanceDBUploadStagerConfig
    )

    @requires_dependencies(["pandas"], extras="lancedb")
    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Convert the elements JSON file into a Feather file and return its path.

        Args:
            elements_filepath: JSON file holding a list of element dicts.
            file_data: Source record; its identifier is stamped on every row.
            output_dir: Directory the Feather file is written to.
            output_filename: File name; its suffix is replaced with ``.feather``.

        Returns:
            Path of the written Feather file.
        """
        import pandas as pd

        # Read as UTF-8 explicitly rather than with the platform default encoding,
        # which is not UTF-8 on every system.
        with open(elements_filepath, encoding="utf-8") as elements_file:
            elements_contents: list[dict] = json.load(elements_file)

        df = pd.DataFrame(
            [
                self.conform_dict(element_dict=element_dict, file_data=file_data)
                for element_dict in elements_contents
            ]
        )

        output_path = (output_dir / output_filename).with_suffix(".feather")
        df.to_feather(output_path)

        return output_path

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Build one table row from an element dict.

        The ``embeddings`` value (or ``None``) is moved to ``vector``, the record id is
        set from ``file_data.identifier``, and the remaining keys are flattened with
        ``-`` as separator. Flattened keys come last, so they override on a name clash.
        """
        # Work on a shallow copy so popping "embeddings" leaves the caller's dict intact.
        data = element_dict.copy()
        return {
            "vector": data.pop("embeddings", None),
            RECORD_ID_LABEL: file_data.identifier,
            **flatten_dict(data, separator="-"),
        }
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# Uploader configuration for LanceDB: names the table that is opened for uploads.
class LanceDBUploaderConfig(UploaderConfig):
    table_name: str = Field(description="The name of the table.")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Uploader that writes staged Feather files into an existing LanceDB table.
@dataclass
class LanceDBUploader(Uploader):
    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    def precheck(self):
        """Check that the configured table can be opened, running the async check to completion."""

        async def _open_and_close_table() -> None:
            async with self.connection_config.get_async_connection() as connection:
                opened = await connection.open_table(self.upload_config.table_name)
                opened.close()

        asyncio.run(_open_and_close_table())

    @asynccontextmanager
    async def get_table(self) -> AsyncGenerator["AsyncTable", None]:
        """Async context manager yielding the configured table; the table is closed on exit."""
        async with self.connection_config.get_async_connection() as connection:
            opened = await connection.open_table(self.upload_config.table_name)
            try:
                yield opened
            finally:
                opened.close()

    @requires_dependencies(["pandas"], extras="lancedb")
    async def run_async(self, path, file_data, **kwargs):
        """Load the staged Feather file at ``path`` and add its rows to the table.

        When the table schema has the record-id column, rows already stored for
        ``file_data.identifier`` are deleted first; otherwise a warning is logged and
        the rows are simply appended.
        """
        import pandas as pd

        staged = pd.read_feather(path)
        async with self.get_table() as table:
            table_schema = await table.schema()
            staged = self._fit_to_schema(staged, table_schema)
            if RECORD_ID_LABEL in table_schema.names:
                # NOTE(review): the filter is built by string interpolation; an identifier
                # containing a double quote would break it - confirm against callers.
                await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
            else:
                logger.warning(
                    f"Designated table doesn't contain {RECORD_ID_LABEL} column of type"
                    " string which is required to support overwriting updates on subsequent"
                    " uploads of the same record. New rows will be appended instead."
                )
            await table.add(data=staged)

    def _fit_to_schema(self, df: "DataFrame", schema) -> "DataFrame":
        """Return ``df`` with columns absent from ``schema`` dropped and missing ones added empty."""
        import pandas as pd

        present = set(df.columns)
        expected = set(schema.names)
        extra = present - expected
        absent = expected - present

        if extra:
            logger.info(
                "Following columns will be dropped to match the table's schema: "
                f"{', '.join(extra)}"
            )
        if absent:
            logger.info(
                "Following null filled columns will be added to match the table's schema:"
                f" {', '.join(absent)} "
            )

        fitted = df.drop(columns=extra)
        for name in absent:
            fitted[name] = pd.Series()

        return fitted
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.interfaces.connector import AccessConfig
|
|
6
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
7
|
+
from unstructured_ingest.processes.connectors.lancedb.lancedb import (
|
|
8
|
+
LanceDBConnectionConfig,
|
|
9
|
+
LanceDBUploader,
|
|
10
|
+
LanceDBUploaderConfig,
|
|
11
|
+
LanceDBUploadStager,
|
|
12
|
+
LanceDBUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Connector type identifier; used below as the default `connector_type` of the uploader.
CONNECTOR_TYPE = "lancedb_local"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Access config for the local LanceDB connector; declares no fields of its own.
class LanceDBLocalAccessConfig(AccessConfig):
    pass
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Connection settings for the local LanceDB connector.
class LanceDBLocalConnectionConfig(LanceDBConnectionConfig):
    # Optional: a fresh (empty) access config is created when none is supplied.
    access_config: Secret[LanceDBLocalAccessConfig] = Field(
        default_factory=LanceDBLocalAccessConfig, validate_default=True
    )

    def get_storage_options(self) -> None:
        """Return ``None``: this connection config supplies no storage options."""
        return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# LanceDB uploader specialised to the local connection config.
@dataclass
class LanceDBLocalUploader(LanceDBUploader):
    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBLocalConnectionConfig
    # Defaults to this module's connector type identifier.
    connector_type: str = CONNECTOR_TYPE
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Destination registry entry bundling the connection config, uploader and stager classes
# (with their config classes) for the local LanceDB connector.
lancedb_local_destination_entry = DestinationRegistryEntry(
    connection_config=LanceDBLocalConnectionConfig,
    uploader=LanceDBLocalUploader,
    uploader_config=LanceDBUploaderConfig,
    upload_stager_config=LanceDBUploadStagerConfig,
    upload_stager=LanceDBUploadStager,
)
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import json
|
|
3
|
+
import shutil
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from time import time
|
|
7
|
+
from typing import Any, Generator
|
|
8
|
+
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.data_types.file_data import (
|
|
12
|
+
FileData,
|
|
13
|
+
FileDataSourceMetadata,
|
|
14
|
+
SourceIdentifiers,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.error import FileExistsError
|
|
17
|
+
from unstructured_ingest.interfaces import (
|
|
18
|
+
AccessConfig,
|
|
19
|
+
ConnectionConfig,
|
|
20
|
+
Downloader,
|
|
21
|
+
DownloaderConfig,
|
|
22
|
+
DownloadResponse,
|
|
23
|
+
Indexer,
|
|
24
|
+
IndexerConfig,
|
|
25
|
+
Uploader,
|
|
26
|
+
UploaderConfig,
|
|
27
|
+
)
|
|
28
|
+
from unstructured_ingest.logger import logger
|
|
29
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
30
|
+
DestinationRegistryEntry,
|
|
31
|
+
SourceRegistryEntry,
|
|
32
|
+
)
|
|
33
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
34
|
+
BlobStoreUploadStager,
|
|
35
|
+
BlobStoreUploadStagerConfig,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Connector type identifier; used by the indexer, downloader and uploader below and
# stamped on every FileData the indexer yields.
CONNECTOR_TYPE = "local"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Access config for the local connector; declares no fields of its own.
class LocalAccessConfig(AccessConfig):
    pass
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Connection settings for the local connector.
class LocalConnectionConfig(ConnectionConfig):
    # Optional: defaults to an empty LocalAccessConfig instance created at class definition.
    access_config: Secret[LocalAccessConfig] = Field(
        default=LocalAccessConfig(), validate_default=True
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Indexer settings for the local connector: where to look and whether to descend.
class LocalIndexerConfig(IndexerConfig):
    # File or directory to index.
    input_path: Path = Field(
        description="Path to the location in the local file system that will be processed."
    )
    # When False only the entries directly under input_path are considered.
    recursive: bool = Field(
        default=False,
        description="Recursively download files in their respective folders "
        "otherwise stop at the files in provided folder level.",
    )

    @property
    def path(self) -> Path:
        """Resolved form of ``input_path``."""
        configured = Path(self.input_path)
        return configured.resolve()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Indexer that enumerates files on the local file system and yields a FileData for each.
@dataclass
class LocalIndexer(Indexer):
    index_config: LocalIndexerConfig
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def list_files(self) -> list[Path]:
        """Return the files to index.

        A file input path yields just that file. A directory yields its regular files,
        searched recursively when ``index_config.recursive`` is set and only at the top
        level otherwise.
        """
        input_path = self.index_config.path
        if input_path.is_file():
            # Return the path itself instead of globbing it: glob treats characters
            # such as "[", "*" and "?" as pattern syntax, so an existing file whose
            # name contains them could match nothing and be dropped.
            return [input_path]
        walk = input_path.rglob if self.index_config.recursive else input_path.glob
        return [f for f in walk("*") if f.is_file()]

    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
        """Collect stat-based metadata for ``path``.

        Each attribute is read on a best-effort basis: a failure is logged as a warning
        and the corresponding field is left as ``None``.
        """
        stats = path.stat()
        try:
            date_modified = str(stats.st_mtime)
        except Exception as e:
            logger.warning(f"Couldn't detect date modified: {e}")
            date_modified = None

        try:
            # st_birthtime is not available on every platform.
            date_created = str(stats.st_birthtime)
        except Exception as e:
            logger.warning(f"Couldn't detect date created: {e}")
            date_created = None

        try:
            mode = stats.st_mode
            permissions_data = [{"mode": mode}]
        except Exception as e:
            logger.warning(f"Couldn't detect file mode: {e}")
            permissions_data = None

        try:
            filesize_bytes = stats.st_size
        except Exception as e:
            logger.warning(f"Couldn't detect file size: {e}")
            filesize_bytes = None

        return FileDataSourceMetadata(
            date_modified=date_modified,
            date_created=date_created,
            date_processed=str(time()),
            permissions_data=permissions_data,
            record_locator={"path": str(path.resolve())},
            filesize_bytes=filesize_bytes,
        )

    @staticmethod
    def _relative_to_root(resolved: Path, root: Path) -> str:
        """Return ``resolved`` relative to ``root`` as a string."""
        try:
            # Strip the root only as a leading prefix; a plain string replace would
            # also remove later occurrences of the root text inside the path.
            return str(resolved.relative_to(root))
        except ValueError:
            # The resolved path is not under root (for example a link pointing
            # elsewhere): keep the previous result, the full path minus its first char.
            return str(resolved)[1:]

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a FileData for every file returned by ``list_files``.

        ``rel_path`` is the file's path relative to the input directory, or the file
        name when the input path is itself a file.
        """
        root = self.index_config.path
        root_is_file = root.is_file()
        for file_path in self.list_files():
            resolved = file_path.resolve()
            source_identifiers = SourceIdentifiers(
                fullpath=str(resolved),
                filename=file_path.name,
                rel_path=(
                    root.name if root_is_file else self._relative_to_root(resolved, root)
                ),
            )
            file_data = FileData(
                identifier=str(resolved),
                connector_type=CONNECTOR_TYPE,
                source_identifiers=source_identifiers,
                metadata=self.get_file_metadata(path=file_path),
                display_name=source_identifiers.fullpath,
            )
            yield file_data
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# Downloader configuration for the local connector; declares no options of its own.
class LocalDownloaderConfig(DownloaderConfig):
    pass
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Downloader for local files: the file is already on disk, so its own path is returned.
@dataclass
class LocalDownloader(Downloader):
    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the file's full source path as a ``Path``."""
        source_location = file_data.source_identifiers.fullpath
        return Path(source_location)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Build a download response that points at the file's full source path."""
        local_path = Path(file_data.source_identifiers.fullpath)
        return DownloadResponse(file_data=file_data, path=local_path)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# Uploader settings for the local connector: the directory output is written to.
class LocalUploaderConfig(UploaderConfig):
    output_dir: str = Field(
        default="structured-output", description="Local path to write partitioned output to"
    )

    @property
    def output_path(self) -> Path:
        """Resolved form of ``output_dir``."""
        return Path(self.output_dir).resolve()

    def model_post_init(self, context: Any) -> None:
        """Run the output-path check once the model has been initialised.

        ``__post_init__`` is a dataclass hook and is not invoked for pydantic models,
        so the check is routed through pydantic's post-init hook.
        """
        super().model_post_init(context)
        self.__post_init__()

    def __post_init__(self):
        """Raise ``FileExistsError`` when the output path already exists as a file."""
        if self.output_path.exists() and self.output_path.is_file():
            raise FileExistsError(f"output path {self.output_path} already exists as a file")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# Uploader that writes results as JSON files under the configured local output directory.
@dataclass
class LocalUploader(Uploader):
    connector_type: str = CONNECTOR_TYPE
    upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )

    def is_async(self) -> bool:
        """Return ``False``: this uploader runs synchronously."""
        return False

    def get_destination_path(self, file_data: FileData) -> Path:
        """Return the output file path for ``file_data``, creating its parent directory.

        With source identifiers the path mirrors the relative source path under the
        output directory, with ``.json`` appended to the file name. Without them the
        file is named after ``file_data.identifier``.
        """
        if source_identifiers := file_data.source_identifiers:
            rel_path = source_identifiers.relative_path
            if rel_path.startswith("/"):
                # Drop one leading slash so the join below stays inside the output dir.
                rel_path = rel_path[1:]
            new_path = self.upload_config.output_path / Path(rel_path)
            # Append ".json" to the final component only. Replacing the filename text
            # across the whole path string also rewrote any directory whose name
            # contained the filename.
            final_path = new_path.with_name(f"{new_path.name}.json")
        else:
            final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
        final_path.parent.mkdir(parents=True, exist_ok=True)
        return final_path

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Serialise ``data`` as JSON into the destination file for ``file_data``."""
        final_path = self.get_destination_path(file_data=file_data)
        with final_path.open("w") as f:
            json.dump(data, f)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Copy the file at ``path`` to the destination file for ``file_data``."""
        final_path = self.get_destination_path(file_data=file_data)
        logger.debug(f"copying file from {path} to {final_path}")
        shutil.copy(src=str(path), dst=str(final_path))
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# Source registry entry bundling the indexer, downloader and connection config classes
# (with their config classes) for the local connector.
local_source_entry = SourceRegistryEntry(
    indexer=LocalIndexer,
    indexer_config=LocalIndexerConfig,
    downloader=LocalDownloader,
    downloader_config=LocalDownloaderConfig,
    connection_config=LocalConnectionConfig,
)
|
|
221
|
+
|
|
222
|
+
# Destination registry entry for the local connector: the local uploader paired with the
# blob-store upload stager. No connection config is passed here.
local_destination_entry = DestinationRegistryEntry(
    uploader=LocalUploader,
    uploader_config=LocalUploaderConfig,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)
|