unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/chunker.py

@@ -3,29 +3,70 @@ from dataclasses import dataclass, fields
 from pathlib import Path
 from typing import Any, Optional

-from
-from unstructured.documents.elements import Element, assign_and_map_hash_ids
-from unstructured.staging.base import dict_to_elements, elements_from_json
+from pydantic import BaseModel, Field, SecretStr

-from unstructured_ingest.
+from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger

+CHUNK_MAX_CHARS_DEFAULT: int = 500
+CHUNK_MULTI_PAGE_DEFAULT: bool = True

-@dataclass
-class ChunkerConfig(EnhancedDataClassJsonMixin):
-    chunking_strategy: Optional[str] = None
-    chunking_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
-    chunk_by_api: bool = False
-    chunk_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)

-
-
-
-
-
-
-
+class ChunkerConfig(BaseModel):
+    chunking_strategy: Optional[str] = Field(
+        default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
+    )
+    chunking_endpoint: Optional[str] = Field(
+        default="https://api.unstructured.io/general/v0/general",
+        description="If chunking via api, use the following host.",
+    )
+    chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
+    chunk_api_key: Optional[SecretStr] = Field(
+        default=None, description="API Key for chunking endpoint."
+    )
+
+    chunk_combine_text_under_n_chars: Optional[int] = Field(
+        default=None,
+        description="Combine consecutive chunks when the first does not exceed this length and"
+        " the second will fit without exceeding the hard-maximum length. Only"
+        " operative for 'by_title' chunking-strategy.",
+    )
+    chunk_include_orig_elements: Optional[bool] = Field(
+        default=None,
+        description="When chunking, add the original elements consolidated to form each chunk to"
+        " `.metadata.orig_elements` on that chunk.",
+    )
+    chunk_max_characters: int = Field(
+        default=CHUNK_MAX_CHARS_DEFAULT,
+        description="Hard maximum chunk length. No chunk will exceed this length. An oversized"
+        " element will be divided by text-splitting to fit this window.",
+    )
+    chunk_multipage_sections: bool = Field(
+        default=CHUNK_MULTI_PAGE_DEFAULT,
+        description="Ignore page boundaries when chunking such that elements from two different"
+        " pages can appear in the same chunk. Only operative for 'by_title'"
+        " chunking-strategy.",
+    )
+    chunk_new_after_n_chars: Optional[int] = Field(
+        default=None,
+        description="Soft-maximum chunk length. Another element will not be added to a chunk of"
+        " this length even when it would fit without exceeding the hard-maximum"
+        " length.",
+    )
+    chunk_overlap: Optional[int] = Field(
+        default=None,
+        description="Prefix chunk text with last overlap=N characters of prior chunk. Only"
+        " applies to oversized chunks divided by text-splitting. To apply overlap to"
+        " non-oversized chunks use the --overlap-all option.",
+    )
+    chunk_overlap_all: Optional[bool] = Field(
+        default=None,
+        description="Apply overlap to chunks formed from whole elements as well as those formed"
+        " by text-splitting oversized elements. Overlap length is take from --overlap"
+        " option value.",
+    )

     def to_chunking_kwargs(self) -> dict[str, Any]:
         return {

@@ -47,10 +88,14 @@ class Chunker(BaseProcess, ABC):
     def is_async(self) -> bool:
         return self.config.chunk_by_api

-
+    @requires_dependencies(dependencies=["unstructured"])
+    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
+        from unstructured.chunking import dispatch
+        from unstructured.staging.base import elements_from_json
+
         elements = elements_from_json(filename=str(elements_filepath))
         if not elements:
-            return elements
+            return [e.to_dict() for e in elements]
         local_chunking_strategies = ("basic", "by_title")
         if self.config.chunking_strategy not in local_chunking_strategies:
             logger.warning(

@@ -58,17 +103,19 @@ class Chunker(BaseProcess, ABC):
                     self.config.chunking_strategy, ", ".join(local_chunking_strategies)
                 )
             )
-            return elements
+            return [e.to_dict() for e in elements]
         chunked_elements = dispatch.chunk(elements=elements, **self.config.to_chunking_kwargs())
-
-
+        chunked_elements_dicts = [e.to_dict() for e in chunked_elements]
+        chunked_elements_dicts = assign_and_map_hash_ids(elements=chunked_elements_dicts)
+        return chunked_elements_dicts

-
+    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
+    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         from unstructured_client import UnstructuredClient
         from unstructured_client.models.shared import Files, PartitionParameters

         client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key,
+            api_key_auth=self.config.chunk_api_key.get_secret_value(),
             server_url=self.config.chunking_endpoint,
         )
         partition_request = self.config.to_chunking_kwargs()

@@ -89,9 +136,8 @@ class Chunker(BaseProcess, ABC):
             file_name=str(elements_filepath.resolve()),
         )
         filtered_partition_request["files"] = files
-
+        partition_params = PartitionParameters(**filtered_partition_request)
         resp = client.general.partition(partition_params)
-
-        elements =
-        assign_and_map_hash_ids(elements)
+        elements = resp.elements or []
+        elements = assign_and_map_hash_ids(elements=elements)
         return elements
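
For orientation, a minimal sketch of how the new Pydantic-based ChunkerConfig shown above might be constructed and consumed; the import path follows the file list above, and the field values are illustrative only.

# Illustrative only: field names and defaults come from the ChunkerConfig diff above;
# the import path assumes the module layout shown in the file list.
from unstructured_ingest.v2.processes.chunker import ChunkerConfig

config = ChunkerConfig(
    chunking_strategy="by_title",   # one of the locally supported strategies
    chunk_max_characters=800,       # overrides CHUNK_MAX_CHARS_DEFAULT (500)
    chunk_multipage_sections=False,
)

# The model is translated into keyword arguments for the chunking call.
print(config.to_chunking_kwargs())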

unstructured_ingest/v2/processes/connector_registry.py

@@ -1,3 +1,4 @@
+from abc import ABC
 from dataclasses import dataclass
 from typing import Optional, Type, TypeVar

@@ -25,7 +26,12 @@ UploaderT = TypeVar("UploaderT", bound=Uploader)


 @dataclass
-class
+class RegistryEntry(ABC):
+    pass
+
+
+@dataclass
+class SourceRegistryEntry(RegistryEntry):
     indexer: Type[IndexerT]
     downloader: Type[DownloaderT]

@@ -44,7 +50,7 @@ def add_source_entry(source_type: str, entry: SourceRegistryEntry):


 @dataclass
-class DestinationRegistryEntry:
+class DestinationRegistryEntry(RegistryEntry):
     uploader: Type[UploaderT]
     upload_stager: Optional[Type[UploadStagerT]] = None

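
As a standalone illustration of the registry pattern introduced above (a shared RegistryEntry base plus per-type registration functions), here is a self-contained sketch. All names below are placeholders; only the shape mirrors SourceRegistryEntry, DestinationRegistryEntry, and add_destination_entry.

# Self-contained sketch of the registry pattern; not the library's actual code.
from abc import ABC
from dataclasses import dataclass
from typing import Optional, Type


@dataclass
class RegistryEntry(ABC):
    pass


@dataclass
class DestinationEntry(RegistryEntry):
    uploader: Type
    uploader_config: Optional[Type] = None


_destination_registry: dict[str, DestinationEntry] = {}


def add_destination(destination_type: str, entry: DestinationEntry) -> None:
    # Refuse duplicate registrations so connector types stay unique.
    if destination_type in _destination_registry:
        raise ValueError(f"destination {destination_type} already registered")
    _destination_registry[destination_type] = entry


class PrintUploader:
    """Toy uploader used only to demonstrate registration."""


add_destination("print", DestinationEntry(uploader=PrintUploader))
print(sorted(_destination_registry))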

unstructured_ingest/v2/processes/connectors/__init__.py

@@ -6,10 +6,14 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )

-from .
-from .
+from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
+from .astradb import astra_db_destination_entry
+from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
+from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
+from .couchbase import couchbase_destination_entry
 from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
 from .databricks_volumes import databricks_volumes_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE

@@ -39,10 +43,12 @@ from .sql import sql_destination_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry

-add_destination_entry(destination_type=
+add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)

 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)

+add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
+
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
 add_destination_entry(
     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry

@@ -77,3 +83,7 @@ add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
+add_destination_entry(
+    destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
+    entry=azure_cognitive_search_destination_entry,
+)

unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py}

@@ -3,10 +3,11 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional

-from
-from unstructured.__version__ import __version__ as integration_version
+from pydantic import Field, Secret

-from unstructured_ingest
+from unstructured_ingest import __name__ as integration_name
+from unstructured_ingest.__version__ import __version__ as integration_version
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (

@@ -27,30 +28,30 @@ from unstructured_ingest.v2.processes.connector_registry import (
 if TYPE_CHECKING:
     from astrapy.db import AstraDBCollection

-CONNECTOR_TYPE = "
+CONNECTOR_TYPE = "astradb"


 @dataclass
-class
-    token: str
-    api_endpoint: str
+class AstraDBAccessConfig(AccessConfig):
+    token: str = Field(description="Astra DB Token with access to the database.")
+    api_endpoint: str = Field(description="The API endpoint for the Astra DB.")


 @dataclass
-class
+class AstraDBConnectionConfig(ConnectionConfig):
     connection_type: str = CONNECTOR_TYPE
-    access_config:
+    access_config: Secret[AstraDBAccessConfig]


 @dataclass
-class
+class AstraDBUploadStagerConfig(UploadStagerConfig):
     pass


 @dataclass
-class
-    upload_stager_config:
-        default_factory=lambda:
+class AstraDBUploadStager(UploadStager):
+    upload_stager_config: AstraDBUploadStagerConfig = field(
+        default_factory=lambda: AstraDBUploadStagerConfig()
     )

     def conform_dict(self, element_dict: dict) -> dict:

@@ -79,22 +80,38 @@ class AstraUploadStager(UploadStager):
         return output_path


-
-
-
-
-
-
-
+class AstraDBUploaderConfig(UploaderConfig):
+    collection_name: str = Field(
+        description="The name of the Astra DB collection. "
+        "Note that the collection name must only include letters, "
+        "numbers, and underscores."
+    )
+    embedding_dimension: int = Field(
+        default=384, description="The dimensionality of the embeddings"
+    )
+    namespace: Optional[str] = Field(default=None, description="The Astra DB connection namespace.")
+    requested_indexing_policy: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="The indexing policy to use for the collection.",
+        examples=['{"deny": ["metadata"]}'],
+    )
+    batch_size: int = Field(default=20, description="Number of records per batch")


 @dataclass
-class
-    connection_config:
-    upload_config:
+class AstraDBUploader(Uploader):
+    connection_config: AstraDBConnectionConfig
+    upload_config: AstraDBUploaderConfig
     connector_type: str = CONNECTOR_TYPE

-
+    def precheck(self) -> None:
+        try:
+            self.get_collection()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
         from astrapy.db import AstraDB

@@ -103,14 +120,15 @@ class AstraUploader(Uploader):
         embedding_dimension = self.upload_config.embedding_dimension
         requested_indexing_policy = self.upload_config.requested_indexing_policy

-        # If the user has requested an indexing policy, pass it to the
+        # If the user has requested an indexing policy, pass it to the Astra DB
         options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None

         # Build the Astra DB object.
         # caller_name/version for AstraDB tracking
+        access_configs = self.connection_config.access_config.get_secret_value()
         astra_db = AstraDB(
-            api_endpoint=
-            token=
+            api_endpoint=access_configs.api_endpoint,
+            token=access_configs.token,
             namespace=self.upload_config.namespace,
             caller_name=integration_name,
             caller_version=integration_version,

@@ -136,17 +154,17 @@ class AstraUploader(Uploader):
             f"collection {self.upload_config.collection_name}"
         )

-
+        astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()

-        for chunk in batch_generator(elements_dict,
+        for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)


-
-    connection_config=
-    upload_stager_config=
-    upload_stager=
-    uploader_config=
-    uploader=
+astra_db_destination_entry = DestinationRegistryEntry(
+    connection_config=AstraDBConnectionConfig,
+    upload_stager_config=AstraDBUploadStagerConfig,
+    upload_stager=AstraDBUploadStager,
+    uploader_config=AstraDBUploaderConfig,
+    uploader=AstraDBUploader,
 )
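
The recurring change across the connectors is the move from enhanced_field(..., sensitive=True) to a pydantic Secret-wrapped access config. A minimal sketch of that pattern, assuming pydantic 2.7 or later (which provides the generic Secret type) and using plain BaseModel in place of the library's AccessConfig/ConnectionConfig bases:

# Sketch of the Secret-wrapped access-config pattern; BaseModel stands in for
# the AccessConfig/ConnectionConfig base classes used by the real connectors.
from pydantic import BaseModel, Field, Secret


class AstraDBAccessConfig(BaseModel):
    token: str = Field(description="Astra DB Token with access to the database.")
    api_endpoint: str = Field(description="The API endpoint for the Astra DB.")


class AstraDBConnectionConfig(BaseModel):
    access_config: Secret[AstraDBAccessConfig]


config = AstraDBConnectionConfig(
    access_config=AstraDBAccessConfig(
        token="AstraCS:example-token",
        api_endpoint="https://example-region.apps.astra.datastax.com",
    )
)

print(config)  # the nested credentials are masked in the repr
creds = config.access_config.get_secret_value()  # unwrap only where they are used
print(creds.api_endpoint)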

unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py

@@ -1,10 +1,11 @@
 import json
-import typing as t
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies

@@ -20,27 +21,31 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
-    add_destination_entry,
 )
 from unstructured_ingest.v2.processes.connectors.utils import parse_datetime

-if
+if TYPE_CHECKING:
     from azure.search.documents import SearchClient


 CONNECTOR_TYPE = "azure_cognitive_search"


-@dataclass
 class AzureCognitiveSearchAccessConfig(AccessConfig):
-
+    azure_cognitive_search_key: str = Field(
+        alias="key", description="Credential that is used for authenticating to an Azure service"
+    )


-@dataclass
 class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
-    endpoint: str
-
-
+    endpoint: str = Field(
+        description="The URL endpoint of an Azure AI (Cognitive) search service. "
+        "In the form of https://{{service_name}}.search.windows.net"
+    )
+    index: str = Field(
+        description="The name of the Azure AI (Cognitive) Search index to connect to."
+    )
+    access_config: Secret[AzureCognitiveSearchAccessConfig]

     @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
     def generate_client(self) -> "SearchClient":

@@ -50,18 +55,18 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
         return SearchClient(
             endpoint=self.endpoint,
             index_name=self.index,
-            credential=AzureKeyCredential(
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_cognitive_search_key
+            ),
         )


-@dataclass
 class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
     pass


-@dataclass
 class AzureCognitiveSearchUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")


 @dataclass

@@ -122,7 +127,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
-        **kwargs:
+        **kwargs: Any,
     ) -> Path:
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)

@@ -143,7 +148,7 @@ class AzureCognitiveSearchUploader(Uploader):

     @DestinationConnectionError.wrap
     @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self, *args, elements_dict:
+    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
         import azure.core.exceptions

         logger.info(

@@ -169,16 +174,25 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.
+                        f"{error.azure_cognitive_search_key}: "
+                        f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
             )

+    def precheck(self) -> None:
+        try:
+            client = self.connection_config.generate_client()
+            client.get_document_count()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def write_dict_wrapper(self, elements_dict):
         return self.write_dict(elements_dict=elements_dict)

-    def run(self, contents: list[UploadContent], **kwargs:
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:

         elements_dict = []
         for content in contents:

@@ -199,13 +213,10 @@ class AzureCognitiveSearchUploader(Uploader):
             self.write_dict(elements_dict=chunk)  # noqa: E203


-
-
-
-
-
-
-        upload_stager=AzureCognitiveSearchUploadStager,
-        upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
-    ),
+azure_cognitive_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureCognitiveSearchConnectionConfig,
+    uploader=AzureCognitiveSearchUploader,
+    uploader_config=AzureCognitiveSearchUploaderConfig,
+    upload_stager=AzureCognitiveSearchUploadStager,
+    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
 )
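
The aliased credential field above means callers keep supplying key= even though the stored attribute is azure_cognitive_search_key. A small sketch of just that aliasing behaviour, using plain pydantic with BaseModel standing in for the AccessConfig base:

# Sketch of the alias="key" behaviour; BaseModel stands in for AccessConfig.
from pydantic import BaseModel, Field


class AzureCognitiveSearchAccessConfig(BaseModel):
    azure_cognitive_search_key: str = Field(
        alias="key",
        description="Credential that is used for authenticating to an Azure service",
    )


# Populated via the alias, read back via the full field name.
access = AzureCognitiveSearchAccessConfig(key="<admin-key>")
print(access.azure_cognitive_search_key)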

unstructured_ingest/v2/processes/connectors/chroma.py

@@ -3,11 +3,11 @@ import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Optional

 from dateutil import parser
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies

@@ -32,26 +32,35 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "chroma"


-@dataclass
 class ChromaAccessConfig(AccessConfig):
-    settings: Optional[
-
+    settings: Optional[dict[str, str]] = Field(
+        default=None, description="A dictionary of settings to communicate with the chroma server."
+    )
+    headers: Optional[dict[str, str]] = Field(
+        default=None, description="A dictionary of headers to send to the Chroma server."
+    )


-@dataclass
 class ChromaConnectionConfig(ConnectionConfig):
-    collection_name: str
-    access_config: ChromaAccessConfig
-    path: Optional[str] =
-
-
-
-
-
-
+    collection_name: str = Field(description="The name of the Chroma collection to write into.")
+    access_config: Secret[ChromaAccessConfig]
+    path: Optional[str] = Field(
+        default=None, description="Location where Chroma is persisted, if not connecting via http."
+    )
+    tenant: Optional[str] = Field(
+        default="default_tenant", description="The tenant to use for this client."
+    )
+    database: Optional[str] = Field(
+        default="default_database", description="The database to use for this client."
+    )
+    host: Optional[str] = Field(default=None, description="The hostname of the Chroma server.")
+    port: Optional[int] = Field(default=None, description="The port of the Chroma server.")
+    ssl: bool = Field(
+        default=False, description="Whether to use SSL to connect to the Chroma server."
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)


-@dataclass
 class ChromaUploadStagerConfig(UploadStagerConfig):
     pass

@@ -101,9 +110,8 @@ class ChromaUploadStager(UploadStager):
         return output_path


-@dataclass
 class ChromaUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")


 @dataclass

@@ -111,19 +119,23 @@ class ChromaUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: ChromaUploaderConfig
     connection_config: ChromaConnectionConfig
-    client: Optional["Client"] = field(init=False)

-    def
-
+    def precheck(self) -> None:
+        try:
+            self.create_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["chromadb"], extras="chroma")
     def create_client(self) -> "Client":
         import chromadb

+        access_config = self.connection_config.access_config.get_secret_value()
         if self.connection_config.path:
             return chromadb.PersistentClient(
                 path=self.connection_config.path,
-                settings=
+                settings=access_config.settings,
                 tenant=self.connection_config.tenant,
                 database=self.connection_config.database,
             )

@@ -133,8 +145,8 @@ class ChromaUploader(Uploader):
             host=self.connection_config.host,
             port=self.connection_config.port,
             ssl=self.connection_config.ssl,
-            headers=
-            settings=
+            headers=access_config.headers,
+            settings=access_config.settings,
             tenant=self.connection_config.tenant,
             database=self.connection_config.database,
         )

@@ -187,10 +199,9 @@ class ChromaUploader(Uploader):
             f"collection {self.connection_config.collection_name} "
             f"at {self.connection_config.host}",
         )
+        client = self.create_client()

-        collection =
-            name=self.connection_config.collection_name
-        )
+        collection = client.get_or_create_collection(name=self.connection_config.collection_name)
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             self.upsert_batch(collection, self.prepare_chroma_list(chunk))

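
Several uploaders above write in fixed-size batches via batch_generator(elements_dict, batch_size). The real helper lives in unstructured_ingest.utils.data_prep and its implementation is not shown in this diff; the following is an illustrative local stand-in for the batching pattern only.

# Illustrative stand-in for the batching helper used by the uploaders above.
from typing import Iterator


def batch_generator(items: list[dict], batch_size: int) -> Iterator[list[dict]]:
    # Yield consecutive slices of at most batch_size items.
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


records = [{"id": i} for i in range(7)]
for batch in batch_generator(records, batch_size=3):
    print(len(batch))  # 3, 3, 1 -> each batch becomes one insert/upsert call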