unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/chunker.py

@@ -3,29 +3,70 @@ from dataclasses import dataclass, fields
 from pathlib import Path
 from typing import Any, Optional
 
-from unstructured.chunking import dispatch
-from unstructured.documents.elements import Element, assign_and_map_hash_ids
-from unstructured.staging.base import dict_to_elements, elements_from_json
+from pydantic import BaseModel, Field, SecretStr
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
+from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
 
+CHUNK_MAX_CHARS_DEFAULT: int = 500
+CHUNK_MULTI_PAGE_DEFAULT: bool = True
 
-@dataclass
-class ChunkerConfig(EnhancedDataClassJsonMixin):
-    chunking_strategy: Optional[str] = None
-    chunking_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
-    chunk_by_api: bool = False
-    chunk_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
 
-    chunk_combine_text_under_n_chars: Optional[int] = None
-    chunk_include_orig_elements: Optional[bool] = None
-    chunk_max_characters: Optional[int] = None
-    chunk_multipage_sections: Optional[bool] = None
-    chunk_new_after_n_chars: Optional[int] = None
-    chunk_overlap: Optional[int] = None
-    chunk_overlap_all: Optional[bool] = None
+class ChunkerConfig(BaseModel):
+    chunking_strategy: Optional[str] = Field(
+        default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
+    )
+    chunking_endpoint: Optional[str] = Field(
+        default="https://api.unstructured.io/general/v0/general",
+        description="If chunking via api, use the following host.",
+    )
+    chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
+    chunk_api_key: Optional[SecretStr] = Field(
+        default=None, description="API Key for chunking endpoint."
+    )
+
+    chunk_combine_text_under_n_chars: Optional[int] = Field(
+        default=None,
+        description="Combine consecutive chunks when the first does not exceed this length and"
+        " the second will fit without exceeding the hard-maximum length. Only"
+        " operative for 'by_title' chunking-strategy.",
+    )
+    chunk_include_orig_elements: Optional[bool] = Field(
+        default=None,
+        description="When chunking, add the original elements consolidated to form each chunk to"
+        " `.metadata.orig_elements` on that chunk.",
+    )
+    chunk_max_characters: int = Field(
+        default=CHUNK_MAX_CHARS_DEFAULT,
+        description="Hard maximum chunk length. No chunk will exceed this length. An oversized"
+        " element will be divided by text-splitting to fit this window.",
+    )
+    chunk_multipage_sections: bool = Field(
+        default=CHUNK_MULTI_PAGE_DEFAULT,
+        description="Ignore page boundaries when chunking such that elements from two different"
+        " pages can appear in the same chunk. Only operative for 'by_title'"
+        " chunking-strategy.",
+    )
+    chunk_new_after_n_chars: Optional[int] = Field(
+        default=None,
+        description="Soft-maximum chunk length. Another element will not be added to a chunk of"
+        " this length even when it would fit without exceeding the hard-maximum"
+        " length.",
+    )
+    chunk_overlap: Optional[int] = Field(
+        default=None,
+        description="Prefix chunk text with last overlap=N characters of prior chunk. Only"
+        " applies to oversized chunks divided by text-splitting. To apply overlap to"
+        " non-oversized chunks use the --overlap-all option.",
+    )
+    chunk_overlap_all: Optional[bool] = Field(
+        default=None,
+        description="Apply overlap to chunks formed from whole elements as well as those formed"
+        " by text-splitting oversized elements. Overlap length is take from --overlap"
+        " option value.",
+    )
 
     def to_chunking_kwargs(self) -> dict[str, Any]:
         return {
@@ -47,10 +88,14 @@ class Chunker(BaseProcess, ABC):
     def is_async(self) -> bool:
         return self.config.chunk_by_api
 
-    def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
+    @requires_dependencies(dependencies=["unstructured"])
+    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
+        from unstructured.chunking import dispatch
+        from unstructured.staging.base import elements_from_json
+
         elements = elements_from_json(filename=str(elements_filepath))
         if not elements:
-            return elements
+            return [e.to_dict() for e in elements]
         local_chunking_strategies = ("basic", "by_title")
         if self.config.chunking_strategy not in local_chunking_strategies:
             logger.warning(
@@ -58,17 +103,19 @@ class Chunker(BaseProcess, ABC):
                     self.config.chunking_strategy, ", ".join(local_chunking_strategies)
                 )
             )
-            return elements
+            return [e.to_dict() for e in elements]
         chunked_elements = dispatch.chunk(elements=elements, **self.config.to_chunking_kwargs())
-        assign_and_map_hash_ids(chunked_elements)
-        return chunked_elements
+        chunked_elements_dicts = [e.to_dict() for e in chunked_elements]
+        chunked_elements_dicts = assign_and_map_hash_ids(elements=chunked_elements_dicts)
+        return chunked_elements_dicts
 
-    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
+    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
+    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         from unstructured_client import UnstructuredClient
         from unstructured_client.models.shared import Files, PartitionParameters
 
         client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key,
+            api_key_auth=self.config.chunk_api_key.get_secret_value(),
             server_url=self.config.chunking_endpoint,
         )
         partition_request = self.config.to_chunking_kwargs()
@@ -89,9 +136,8 @@ class Chunker(BaseProcess, ABC):
             file_name=str(elements_filepath.resolve()),
         )
         filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
+        partition_params = PartitionParameters(**filtered_partition_request)
         resp = client.general.partition(partition_params)
-        elements_raw = resp.elements or []
-        elements = dict_to_elements(elements_raw)
-        assign_and_map_hash_ids(elements)
+        elements = resp.elements or []
+        elements = assign_and_map_hash_ids(elements=elements)
         return elements
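
A minimal usage sketch of the new pydantic-based ChunkerConfig shown above. The field values below are illustrative placeholders; the point is that chunk_api_key is now coerced into a pydantic SecretStr, so it is masked when the config is printed or logged.

from unstructured_ingest.v2.processes.chunker import ChunkerConfig

# Placeholder values for illustration only.
config = ChunkerConfig(
    chunking_strategy="by_title",
    chunk_max_characters=500,
    chunk_api_key="my-api-key",  # stored as SecretStr; repr() shows '**********'
)

# to_chunking_kwargs() flattens the config into the keyword arguments that are
# passed to dispatch.chunk locally or sent with the remote chunking request.
print(config.to_chunking_kwargs())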
unstructured_ingest/v2/processes/connector_registry.py

@@ -1,3 +1,4 @@
+from abc import ABC
 from dataclasses import dataclass
 from typing import Optional, Type, TypeVar
 
@@ -25,7 +26,12 @@ UploaderT = TypeVar("UploaderT", bound=Uploader)
 
 
 @dataclass
-class SourceRegistryEntry:
+class RegistryEntry(ABC):
+    pass
+
+
+@dataclass
+class SourceRegistryEntry(RegistryEntry):
     indexer: Type[IndexerT]
     downloader: Type[DownloaderT]
 
@@ -44,7 +50,7 @@ def add_source_entry(source_type: str, entry: SourceRegistryEntry):
 
 
 @dataclass
-class DestinationRegistryEntry:
+class DestinationRegistryEntry(RegistryEntry):
     uploader: Type[UploaderT]
     upload_stager: Optional[Type[UploadStagerT]] = None
 
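A small, hypothetical sketch of what the new abstract RegistryEntry base enables: source and destination entries can now be handled through one common type. The describe helper below is not part of the package; only the three imported classes come from the diff above.

from unstructured_ingest.v2.processes.connector_registry import (
    DestinationRegistryEntry,
    RegistryEntry,
    SourceRegistryEntry,
)

def describe(entry: RegistryEntry) -> str:
    # Both entry flavors share the RegistryEntry base, so generic code can
    # accept either and branch on the concrete type only where it matters.
    if isinstance(entry, SourceRegistryEntry):
        return "source entry"
    if isinstance(entry, DestinationRegistryEntry):
        return "destination entry"
    return "unknown entry"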
 
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -6,10 +6,14 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )
 
-from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE
-from .astra import astra_destination_entry
+from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
+from .astradb import astra_db_destination_entry
+from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
+from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
+from .couchbase import couchbase_destination_entry
 from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
 from .databricks_volumes import databricks_volumes_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
@@ -39,10 +43,12 @@ from .sql import sql_destination_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
-add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry)
+add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
 
 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
 
+add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
+
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
 add_destination_entry(
     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
@@ -77,3 +83,7 @@ add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
+add_destination_entry(
+    destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
+    entry=azure_cognitive_search_destination_entry,
+)
unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py}

@@ -3,10 +3,11 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
-from unstructured import __name__ as integration_name
-from unstructured.__version__ import __version__ as integration_version
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest import __name__ as integration_name
+from unstructured_ingest.__version__ import __version__ as integration_version
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -27,30 +28,30 @@ from unstructured_ingest.v2.processes.connector_registry import (
 if TYPE_CHECKING:
     from astrapy.db import AstraDBCollection
 
-CONNECTOR_TYPE = "astra"
+CONNECTOR_TYPE = "astradb"
 
 
 @dataclass
-class AstraAccessConfig(AccessConfig):
-    token: str
-    api_endpoint: str
+class AstraDBAccessConfig(AccessConfig):
+    token: str = Field(description="Astra DB Token with access to the database.")
+    api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
 
 
 @dataclass
-class AstraConnectionConfig(ConnectionConfig):
+class AstraDBConnectionConfig(ConnectionConfig):
     connection_type: str = CONNECTOR_TYPE
-    access_config: AstraAccessConfig = enhanced_field(sensitive=True)
+    access_config: Secret[AstraDBAccessConfig]
 
 
 @dataclass
-class AstraUploadStagerConfig(UploadStagerConfig):
+class AstraDBUploadStagerConfig(UploadStagerConfig):
     pass
 
 
 @dataclass
-class AstraUploadStager(UploadStager):
-    upload_stager_config: AstraUploadStagerConfig = field(
-        default_factory=lambda: AstraUploadStagerConfig()
+class AstraDBUploadStager(UploadStager):
+    upload_stager_config: AstraDBUploadStagerConfig = field(
+        default_factory=lambda: AstraDBUploadStagerConfig()
     )
 
     def conform_dict(self, element_dict: dict) -> dict:
@@ -79,22 +80,38 @@ class AstraUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
-class AstraUploaderConfig(UploaderConfig):
-    collection_name: str
-    embedding_dimension: int
-    namespace: Optional[str] = None
-    requested_indexing_policy: Optional[dict[str, Any]] = None
-    batch_size: int = 20
+class AstraDBUploaderConfig(UploaderConfig):
+    collection_name: str = Field(
+        description="The name of the Astra DB collection. "
+        "Note that the collection name must only include letters, "
+        "numbers, and underscores."
+    )
+    embedding_dimension: int = Field(
+        default=384, description="The dimensionality of the embeddings"
+    )
+    namespace: Optional[str] = Field(default=None, description="The Astra DB connection namespace.")
+    requested_indexing_policy: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="The indexing policy to use for the collection.",
+        examples=['{"deny": ["metadata"]}'],
+    )
+    batch_size: int = Field(default=20, description="Number of records per batch")
 
 
 @dataclass
-class AstraUploader(Uploader):
-    connection_config: AstraConnectionConfig
-    upload_config: AstraUploaderConfig
+class AstraDBUploader(Uploader):
+    connection_config: AstraDBConnectionConfig
+    upload_config: AstraDBUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["astrapy"], extras="astra")
+    def precheck(self) -> None:
+        try:
+            self.get_collection()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
         from astrapy.db import AstraDB
 
@@ -103,14 +120,15 @@ class AstraUploader(Uploader):
         embedding_dimension = self.upload_config.embedding_dimension
         requested_indexing_policy = self.upload_config.requested_indexing_policy
 
-        # If the user has requested an indexing policy, pass it to the AstraDB
+        # If the user has requested an indexing policy, pass it to the Astra DB
        options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
 
         # Build the Astra DB object.
         # caller_name/version for AstraDB tracking
+        access_configs = self.connection_config.access_config.get_secret_value()
         astra_db = AstraDB(
-            api_endpoint=self.connection_config.access_config.api_endpoint,
-            token=self.connection_config.access_config.token,
+            api_endpoint=access_configs.api_endpoint,
+            token=access_configs.token,
             namespace=self.upload_config.namespace,
             caller_name=integration_name,
             caller_version=integration_version,
@@ -136,17 +154,17 @@ class AstraUploader(Uploader):
             f"collection {self.upload_config.collection_name}"
         )
 
-        astra_batch_size = self.upload_config.batch_size
+        astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()
 
-        for chunk in batch_generator(elements_dict, astra_batch_size):
+        for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)
 
 
-astra_destination_entry = DestinationRegistryEntry(
-    connection_config=AstraConnectionConfig,
-    upload_stager_config=AstraUploadStagerConfig,
-    upload_stager=AstraUploadStager,
-    uploader_config=AstraUploaderConfig,
-    uploader=AstraUploader,
+astra_db_destination_entry = DestinationRegistryEntry(
+    connection_config=AstraDBConnectionConfig,
+    upload_stager_config=AstraDBUploadStagerConfig,
+    upload_stager=AstraDBUploadStager,
+    uploader_config=AstraDBUploaderConfig,
+    uploader=AstraDBUploader,
 )
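
A hedged sketch of the Secret-wrapped access config pattern introduced above (the same pattern shows up in the other connectors in this release). The token and endpoint are placeholders, and the example assumes the v2 config classes validate like pydantic models, which is what the get_secret_value() calls in the diff imply.

from unstructured_ingest.v2.processes.connectors.astradb import (
    AstraDBAccessConfig,
    AstraDBConnectionConfig,
)

connection_config = AstraDBConnectionConfig(
    access_config=AstraDBAccessConfig(
        token="AstraCS:placeholder-token",  # placeholder value
        api_endpoint="https://example-region.apps.astra.datastax.com",  # placeholder value
    )
)

# Credentials stay wrapped in a pydantic Secret until explicitly unwrapped,
# which is exactly what AstraDBUploader.get_collection() does in the diff above.
access = connection_config.access_config.get_secret_value()
print(access.api_endpoint)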
unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py

@@ -1,10 +1,11 @@
 import json
-import typing as t
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -20,27 +21,31 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
-    add_destination_entry,
 )
 from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
 
-if t.TYPE_CHECKING:
+if TYPE_CHECKING:
     from azure.search.documents import SearchClient
 
 
 CONNECTOR_TYPE = "azure_cognitive_search"
 
 
-@dataclass
 class AzureCognitiveSearchAccessConfig(AccessConfig):
-    key: t.Optional[str] = enhanced_field(default=None, overload_name="azure_cognitive_search_key")
+    azure_cognitive_search_key: str = Field(
+        alias="key", description="Credential that is used for authenticating to an Azure service"
+    )
 
 
-@dataclass
 class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
-    endpoint: str
-    index: str
-    access_config: AzureCognitiveSearchAccessConfig = enhanced_field(sensitive=True)
+    endpoint: str = Field(
+        description="The URL endpoint of an Azure AI (Cognitive) search service. "
+        "In the form of https://{{service_name}}.search.windows.net"
+    )
+    index: str = Field(
+        description="The name of the Azure AI (Cognitive) Search index to connect to."
+    )
+    access_config: Secret[AzureCognitiveSearchAccessConfig]
 
     @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
     def generate_client(self) -> "SearchClient":
@@ -50,18 +55,18 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
         return SearchClient(
             endpoint=self.endpoint,
             index_name=self.index,
-            credential=AzureKeyCredential(self.access_config.key),
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_cognitive_search_key
+            ),
         )
 
 
-@dataclass
 class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
     pass
 
 
-@dataclass
 class AzureCognitiveSearchUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
@@ -122,7 +127,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
-        **kwargs: t.Any,
+        **kwargs: Any,
     ) -> Path:
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
@@ -143,7 +148,7 @@ class AzureCognitiveSearchUploader(Uploader):
 
     @DestinationConnectionError.wrap
     @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
+    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
         import azure.core.exceptions
 
         logger.info(
@@ -169,16 +174,25 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.key}: [{error.status_code}] {error.error_message}"
+                        f"{error.azure_cognitive_search_key}: "
+                        f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
             )
 
+    def precheck(self) -> None:
+        try:
+            client = self.connection_config.generate_client()
+            client.get_document_count()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def write_dict_wrapper(self, elements_dict):
         return self.write_dict(elements_dict=elements_dict)
 
-    def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
 
         elements_dict = []
         for content in contents:
@@ -199,13 +213,10 @@ class AzureCognitiveSearchUploader(Uploader):
             self.write_dict(elements_dict=chunk)  # noqa: E203
 
 
-add_destination_entry(
-    destination_type=CONNECTOR_TYPE,
-    entry=DestinationRegistryEntry(
-        connection_config=AzureCognitiveSearchConnectionConfig,
-        uploader=AzureCognitiveSearchUploader,
-        uploader_config=AzureCognitiveSearchUploaderConfig,
-        upload_stager=AzureCognitiveSearchUploadStager,
-        upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
-    ),
+azure_cognitive_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureCognitiveSearchConnectionConfig,
+    uploader=AzureCognitiveSearchUploader,
+    uploader_config=AzureCognitiveSearchUploaderConfig,
+    upload_stager=AzureCognitiveSearchUploadStager,
+    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
 )
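
One detail worth calling out in the access config above: the key field was renamed to azure_cognitive_search_key but keeps "key" as its pydantic alias, so configs that still supply key should continue to validate. A hedged sketch with a placeholder value, assuming the access config behaves like a standard pydantic model:

from unstructured_ingest.v2.processes.connectors.azure_cognitive_search import (
    AzureCognitiveSearchAccessConfig,
)

# Populated via the alias declared in the diff above; the value is a placeholder.
access = AzureCognitiveSearchAccessConfig(key="placeholder-admin-key")
print(access.azure_cognitive_search_key)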
@@ -3,11 +3,11 @@ import uuid
3
3
  from dataclasses import dataclass, field
4
4
  from datetime import date, datetime
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Dict, Optional
6
+ from typing import TYPE_CHECKING, Any, Optional
7
7
 
8
8
  from dateutil import parser
9
+ from pydantic import Field, Secret
9
10
 
10
- from unstructured_ingest.enhanced_dataclass import enhanced_field
11
11
  from unstructured_ingest.error import DestinationConnectionError
12
12
  from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
13
13
  from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -32,26 +32,35 @@ if TYPE_CHECKING:
32
32
  CONNECTOR_TYPE = "chroma"
33
33
 
34
34
 
35
- @dataclass
36
35
  class ChromaAccessConfig(AccessConfig):
37
- settings: Optional[Dict[str, str]] = None
38
- headers: Optional[Dict[str, str]] = None
36
+ settings: Optional[dict[str, str]] = Field(
37
+ default=None, description="A dictionary of settings to communicate with the chroma server."
38
+ )
39
+ headers: Optional[dict[str, str]] = Field(
40
+ default=None, description="A dictionary of headers to send to the Chroma server."
41
+ )
39
42
 
40
43
 
41
- @dataclass
42
44
  class ChromaConnectionConfig(ConnectionConfig):
43
- collection_name: str
44
- access_config: ChromaAccessConfig = enhanced_field(sensitive=True)
45
- path: Optional[str] = None
46
- tenant: Optional[str] = "default_tenant"
47
- database: Optional[str] = "default_database"
48
- host: Optional[str] = None
49
- port: Optional[int] = None
50
- ssl: bool = False
51
- connector_type: str = CONNECTOR_TYPE
45
+ collection_name: str = Field(description="The name of the Chroma collection to write into.")
46
+ access_config: Secret[ChromaAccessConfig]
47
+ path: Optional[str] = Field(
48
+ default=None, description="Location where Chroma is persisted, if not connecting via http."
49
+ )
50
+ tenant: Optional[str] = Field(
51
+ default="default_tenant", description="The tenant to use for this client."
52
+ )
53
+ database: Optional[str] = Field(
54
+ default="default_database", description="The database to use for this client."
55
+ )
56
+ host: Optional[str] = Field(default=None, description="The hostname of the Chroma server.")
57
+ port: Optional[int] = Field(default=None, description="The port of the Chroma server.")
58
+ ssl: bool = Field(
59
+ default=False, description="Whether to use SSL to connect to the Chroma server."
60
+ )
61
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
52
62
 
53
63
 
54
- @dataclass
55
64
  class ChromaUploadStagerConfig(UploadStagerConfig):
56
65
  pass
57
66
 
@@ -101,9 +110,8 @@ class ChromaUploadStager(UploadStager):
101
110
  return output_path
102
111
 
103
112
 
104
- @dataclass
105
113
  class ChromaUploaderConfig(UploaderConfig):
106
- batch_size: int = 100
114
+ batch_size: int = Field(default=100, description="Number of records per batch")
107
115
 
108
116
 
109
117
  @dataclass
@@ -111,19 +119,23 @@ class ChromaUploader(Uploader):
111
119
  connector_type: str = CONNECTOR_TYPE
112
120
  upload_config: ChromaUploaderConfig
113
121
  connection_config: ChromaConnectionConfig
114
- client: Optional["Client"] = field(init=False)
115
122
 
116
- def __post_init__(self):
117
- self.client = self.create_client()
123
+ def precheck(self) -> None:
124
+ try:
125
+ self.create_client()
126
+ except Exception as e:
127
+ logger.error(f"failed to validate connection: {e}", exc_info=True)
128
+ raise DestinationConnectionError(f"failed to validate connection: {e}")
118
129
 
119
130
  @requires_dependencies(["chromadb"], extras="chroma")
120
131
  def create_client(self) -> "Client":
121
132
  import chromadb
122
133
 
134
+ access_config = self.connection_config.access_config.get_secret_value()
123
135
  if self.connection_config.path:
124
136
  return chromadb.PersistentClient(
125
137
  path=self.connection_config.path,
126
- settings=self.connection_config.access_config.settings,
138
+ settings=access_config.settings,
127
139
  tenant=self.connection_config.tenant,
128
140
  database=self.connection_config.database,
129
141
  )
@@ -133,8 +145,8 @@ class ChromaUploader(Uploader):
133
145
  host=self.connection_config.host,
134
146
  port=self.connection_config.port,
135
147
  ssl=self.connection_config.ssl,
136
- headers=self.connection_config.access_config.headers,
137
- settings=self.connection_config.access_config.settings,
148
+ headers=access_config.headers,
149
+ settings=access_config.settings,
138
150
  tenant=self.connection_config.tenant,
139
151
  database=self.connection_config.database,
140
152
  )
@@ -187,10 +199,9 @@ class ChromaUploader(Uploader):
187
199
  f"collection {self.connection_config.collection_name} "
188
200
  f"at {self.connection_config.host}",
189
201
  )
202
+ client = self.create_client()
190
203
 
191
- collection = self.client.get_or_create_collection(
192
- name=self.connection_config.collection_name
193
- )
204
+ collection = client.get_or_create_collection(name=self.connection_config.collection_name)
194
205
  for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
195
206
  self.upsert_batch(collection, self.prepare_chroma_list(chunk))
196
207