unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (123)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +1 -5
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/local.py +22 -14
  69. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  72. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
  74. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  75. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  76. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  77. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  78. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  79. unstructured_ingest/v2/processes/embedder.py +106 -47
  80. unstructured_ingest/v2/processes/filter.py +11 -5
  81. unstructured_ingest/v2/processes/partitioner.py +79 -33
  82. unstructured_ingest/v2/processes/uncompress.py +3 -3
  83. unstructured_ingest/v2/utils.py +45 -0
  84. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  85. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
  86. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  87. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  88. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  89. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  90. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  91. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  92. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  93. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  94. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  95. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  96. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  97. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  98. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  99. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  100. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  101. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  102. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  103. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  104. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  105. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  106. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  108. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  109. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  110. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  111. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  112. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  113. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  114. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  115. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  116. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  117. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  118. unstructured_ingest/v2/cli/interfaces.py +0 -27
  119. unstructured_ingest/v2/pipeline/utils.py +0 -15
  120. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  121. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  122. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  123. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/pipeline/steps/embed.py

@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.embedder import Embedder
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "embed"
 
@@ -30,11 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
@@ -71,13 +65,13 @@ class EmbedStep(PipelineStep):
 
         self._save_output(
             output_filepath=str(output_filepath),
-            embedded_content=elements_to_dicts(embed_content_raw),
+            embedded_content=embed_content_raw,
         )
         return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
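
Across the pipeline steps, the old sterilize_dict / to_dict(redact_sensitive=True) pattern is replaced by Pydantic serialization, and step hashes are now built with the serialize_base_model_json helper from the new unstructured_ingest/v2/utils.py module (+45 lines in this release). The diff only shows the call sites; the following is a minimal sketch of how such a helper could be used to produce a deterministic step hash. The helper name and call signature come from the diff, but the helper body and ExampleEmbedConfig are assumptions, not the shipped implementation.

    # Sketch only: ExampleEmbedConfig is a hypothetical stand-in, and this body of
    # serialize_base_model_json is an assumption; the real helper lives in
    # unstructured_ingest/v2/utils.py and is not reproduced in this diff.
    import hashlib
    import json
    from typing import Optional

    from pydantic import BaseModel, SecretStr


    def serialize_base_model_json(model: BaseModel, **json_kwargs) -> str:
        # Round-trip through a plain dict so json.dumps kwargs such as
        # sort_keys/ensure_ascii apply, yielding a deterministic string to hash.
        return json.dumps(json.loads(model.json()), **json_kwargs)


    class ExampleEmbedConfig(BaseModel):
        embedding_provider: str = "huggingface"
        api_key: Optional[SecretStr] = None


    config = ExampleEmbedConfig(api_key=SecretStr("not-logged"))
    hashable_string = serialize_base_model_json(model=config, sort_keys=True, ensure_ascii=True)
    print(hashlib.sha256(hashable_string.encode()).hexdigest()[:12])  # 12-char step hash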

unstructured_ingest/v2/pipeline/steps/filter.py

@@ -5,7 +5,6 @@ from typing import Callable, Optional
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.filter import Filterer
 
 STEP_ID = "filter"
@@ -17,11 +16,7 @@ class FilterStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:

unstructured_ingest/v2/pipeline/steps/index.py

@@ -6,7 +6,7 @@ from typing import Generator, Optional, TypeVar
 from unstructured_ingest.v2.interfaces.indexer import Indexer
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 IndexerT = TypeVar("IndexerT", bound=Indexer)
 
@@ -22,15 +22,9 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.index_config.to_dict(redact_sensitive=True))
-            if self.process.index_config
-            else None
-        )
+        config = self.process.index_config.json() if self.process.index_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -55,7 +49,17 @@ class IndexStep(PipelineStep):
             continue
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(self.process.index_config.to_dict())
+        index_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.index_config, sort_keys=True)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config, sort_keys=True)
+        )
+        hashable_dict = {
+            "index_config": index_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
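
Note the behavioral change in IndexStep.get_hash: 0.0.3 hashed only index_config, while 0.0.4 folds the connection config in as well, so changing connection settings now produces a different cache key. A small, self-contained illustration of that effect; DemoIndexConfig and DemoConnectionConfig are hypothetical stand-ins for the real Pydantic config models.

    # Illustration of the combined hash; the config classes are placeholders.
    import hashlib
    import json

    from pydantic import BaseModel


    class DemoIndexConfig(BaseModel):
        recursive: bool = True


    class DemoConnectionConfig(BaseModel):
        host: str = "localhost"


    def index_step_hash(index_config: BaseModel, connection_config: BaseModel) -> str:
        hashable_dict = {
            "index_config": json.loads(index_config.json()),
            "connection_config": json.loads(connection_config.json()),
        }
        hashable_string = json.dumps(hashable_dict, sort_keys=True)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]


    # Same index config, different connection: the hashes now differ, whereas the
    # 0.0.3 implementation would have produced the same value for both.
    hash_a = index_step_hash(DemoIndexConfig(), DemoConnectionConfig(host="db-1"))
    hash_b = index_step_hash(DemoIndexConfig(), DemoConnectionConfig(host="db-2"))
    assert hash_a != hash_b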

unstructured_ingest/v2/pipeline/steps/partition.py

@@ -8,8 +8,8 @@ from typing import Callable, Optional, TypedDict
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.partitioner import Partitioner
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "partition"
 
@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"
 
     def __post_init__(self):
-        config = sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+        config = self.process.config.json()
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
@@ -56,7 +56,7 @@ class PartitionStep(PipelineStep):
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata}
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:
@@ -70,8 +70,8 @@ class PartitionStep(PipelineStep):
         return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
        )
         if extras:
             hashable_string += "".join(extras)

unstructured_ingest/v2/pipeline/steps/stage.py

@@ -1,6 +1,5 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
@@ -9,7 +8,7 @@ from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "upload_stage"
 
@@ -29,9 +28,7 @@ class UploadStageStep(PipelineStep):
 
     def __post_init__(self):
         config = (
-            sterilize_dict(self.process.upload_stager_config.to_dict(redact_sensitive=True))
-            if self.process.upload_stager_config
-            else None
+            self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"Created {self.identifier} with configs: {config}")
@@ -56,8 +53,8 @@ class UploadStageStep(PipelineStep):
         return UploadStageStepResponse(file_data_path=file_data_path, path=str(staged_output_path))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.upload_stager_config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.upload_stager_config, sort_keys=True, ensure_ascii=True
        )
         if extras:
             hashable_string += "".join(extras)

unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -5,7 +5,6 @@ from typing import Callable, TypedDict
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.uncompress import Uncompressor
 
 STEP_ID = "uncompress"
@@ -21,11 +20,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:

unstructured_ingest/v2/pipeline/steps/upload.py

@@ -7,7 +7,6 @@ from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.uploader import UploadContent, Uploader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 
 STEP_ID = "upload"
 
@@ -26,15 +25,9 @@ class UploadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.upload_config.to_dict(redact_sensitive=True))
-            if self.process.upload_config
-            else None
-        )
+        config = self.process.upload_config.json() if self.process.upload_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
        )
        logger.info(
            f"Created {self.identifier} with configs: {config}, "
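
All of the step constructors now log config.json() rather than sterilize_dict(... redact_sensitive=True). The redaction burden moves into the config models themselves: credential-bearing fields are declared as Pydantic secret types, which serialize in masked form. A short sketch of that behavior, assuming Pydantic v1-style models (DemoUploadConfig is a hypothetical example, not a class from this package):

    # Demonstrates why logging config.json() no longer leaks credentials:
    # Pydantic masks SecretStr values during serialization.
    from typing import Optional

    from pydantic import BaseModel, SecretStr


    class DemoUploadConfig(BaseModel):
        batch_size: int = 20
        api_key: Optional[SecretStr] = None


    config = DemoUploadConfig(api_key=SecretStr("sk-do-not-log"))
    print(config.json())  # {"batch_size": 20, "api_key": "**********"}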

unstructured_ingest/v2/processes/__init__.py (new file)

@@ -0,0 +1,18 @@
+from .chunker import Chunker, ChunkerConfig
+from .embedder import Embedder, EmbedderConfig
+from .filter import Filterer, FiltererConfig
+from .partitioner import Partitioner, PartitionerConfig
+from .uncompress import UncompressConfig, Uncompressor
+
+__all__ = [
+    "Chunker",
+    "ChunkerConfig",
+    "Embedder",
+    "EmbedderConfig",
+    "Filterer",
+    "FiltererConfig",
+    "Partitioner",
+    "PartitionerConfig",
+    "Uncompressor",
+    "UncompressConfig",
+]
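
The new unstructured_ingest/v2/processes/__init__.py re-exports the core processes and their configs, so callers can import them from one place instead of from each submodule. Usage sketch:

    # The configs are now importable directly from unstructured_ingest.v2.processes.
    from unstructured_ingest.v2.processes import Chunker, ChunkerConfig

    # ChunkerConfig is a Pydantic model in 0.0.4, so it shares the usual
    # construction and serialization API.
    config = ChunkerConfig(chunking_strategy="by_title", chunk_max_characters=800)
    print(config.json())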

unstructured_ingest/v2/processes/chunker.py

@@ -3,29 +3,70 @@ from dataclasses import dataclass, fields
 from pathlib import Path
 from typing import Any, Optional
 
-from unstructured.chunking import dispatch
-from unstructured.documents.elements import Element, assign_and_map_hash_ids
-from unstructured.staging.base import dict_to_elements, elements_from_json
+from pydantic import BaseModel, Field, SecretStr
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
+from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
 
+CHUNK_MAX_CHARS_DEFAULT: int = 500
+CHUNK_MULTI_PAGE_DEFAULT: bool = True
 
-@dataclass
-class ChunkerConfig(EnhancedDataClassJsonMixin):
-    chunking_strategy: Optional[str] = None
-    chunking_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
-    chunk_by_api: bool = False
-    chunk_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
 
-    chunk_combine_text_under_n_chars: Optional[int] = None
-    chunk_include_orig_elements: Optional[bool] = None
-    chunk_max_characters: Optional[int] = None
-    chunk_multipage_sections: Optional[bool] = None
-    chunk_new_after_n_chars: Optional[int] = None
-    chunk_overlap: Optional[int] = None
-    chunk_overlap_all: Optional[bool] = None
+class ChunkerConfig(BaseModel):
+    chunking_strategy: Optional[str] = Field(
+        default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
+    )
+    chunking_endpoint: Optional[str] = Field(
+        default="https://api.unstructured.io/general/v0/general",
+        description="If chunking via api, use the following host.",
+    )
+    chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
+    chunk_api_key: Optional[SecretStr] = Field(
+        default=None, description="API Key for chunking endpoint."
+    )
+
+    chunk_combine_text_under_n_chars: Optional[int] = Field(
+        default=None,
+        description="Combine consecutive chunks when the first does not exceed this length and"
+        " the second will fit without exceeding the hard-maximum length. Only"
+        " operative for 'by_title' chunking-strategy.",
+    )
+    chunk_include_orig_elements: Optional[bool] = Field(
+        default=None,
+        description="When chunking, add the original elements consolidated to form each chunk to"
+        " `.metadata.orig_elements` on that chunk.",
+    )
+    chunk_max_characters: int = Field(
+        default=CHUNK_MAX_CHARS_DEFAULT,
+        description="Hard maximum chunk length. No chunk will exceed this length. An oversized"
+        " element will be divided by text-splitting to fit this window.",
+    )
+    chunk_multipage_sections: bool = Field(
+        default=CHUNK_MULTI_PAGE_DEFAULT,
+        description="Ignore page boundaries when chunking such that elements from two different"
+        " pages can appear in the same chunk. Only operative for 'by_title'"
+        " chunking-strategy.",
+    )
+    chunk_new_after_n_chars: Optional[int] = Field(
+        default=None,
+        description="Soft-maximum chunk length. Another element will not be added to a chunk of"
+        " this length even when it would fit without exceeding the hard-maximum"
+        " length.",
+    )
+    chunk_overlap: Optional[int] = Field(
+        default=None,
+        description="Prefix chunk text with last overlap=N characters of prior chunk. Only"
+        " applies to oversized chunks divided by text-splitting. To apply overlap to"
+        " non-oversized chunks use the --overlap-all option.",
+    )
+    chunk_overlap_all: Optional[bool] = Field(
+        default=None,
+        description="Apply overlap to chunks formed from whole elements as well as those formed"
+        " by text-splitting oversized elements. Overlap length is take from --overlap"
+        " option value.",
+    )
 
     def to_chunking_kwargs(self) -> dict[str, Any]:
         return {
@@ -47,10 +88,14 @@ class Chunker(BaseProcess, ABC):
     def is_async(self) -> bool:
         return self.config.chunk_by_api
 
-    def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
+    @requires_dependencies(dependencies=["unstructured"])
+    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
+        from unstructured.chunking import dispatch
+        from unstructured.staging.base import elements_from_json
+
         elements = elements_from_json(filename=str(elements_filepath))
         if not elements:
-            return elements
+            return [e.to_dict() for e in elements]
         local_chunking_strategies = ("basic", "by_title")
         if self.config.chunking_strategy not in local_chunking_strategies:
             logger.warning(
@@ -58,17 +103,19 @@ class Chunker(BaseProcess, ABC):
                     self.config.chunking_strategy, ", ".join(local_chunking_strategies)
                 )
             )
-            return elements
+            return [e.to_dict() for e in elements]
         chunked_elements = dispatch.chunk(elements=elements, **self.config.to_chunking_kwargs())
-        assign_and_map_hash_ids(chunked_elements)
-        return chunked_elements
+        chunked_elements_dicts = [e.to_dict() for e in chunked_elements]
+        chunked_elements_dicts = assign_and_map_hash_ids(elements=chunked_elements_dicts)
+        return chunked_elements_dicts
 
-    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
+    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
+    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         from unstructured_client import UnstructuredClient
         from unstructured_client.models.shared import Files, PartitionParameters
 
         client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key,
+            api_key_auth=self.config.chunk_api_key.get_secret_value(),
             server_url=self.config.chunking_endpoint,
         )
         partition_request = self.config.to_chunking_kwargs()
@@ -89,9 +136,8 @@ class Chunker(BaseProcess, ABC):
             file_name=str(elements_filepath.resolve()),
         )
         filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
+        partition_params = PartitionParameters(**filtered_partition_request)
         resp = client.general.partition(partition_params)
-        elements_raw = resp.elements or []
-        elements = dict_to_elements(elements_raw)
-        assign_and_map_hash_ids(elements)
+        elements = resp.elements or []
+        elements = assign_and_map_hash_ids(elements=elements)
         return elements
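
With ChunkerConfig now a Pydantic model, chunk_api_key is a SecretStr (read via .get_secret_value() only when chunking through the API), and both run() and run_async() return plain element dicts instead of unstructured Element objects. A local-chunking sketch under those assumptions; "partitioned.json" is a placeholder path to previously partitioned output, and passing the config to the Chunker constructor is assumed from the prior dataclass version.

    # Local chunking sketch; the file path and constructor usage are assumptions.
    from pathlib import Path

    from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig

    config = ChunkerConfig(
        chunking_strategy="by_title",  # must be "basic" or "by_title" for local chunking
        chunk_max_characters=800,      # hard maximum; defaults to CHUNK_MAX_CHARS_DEFAULT (500)
        chunk_new_after_n_chars=600,   # soft maximum
    )
    chunker = Chunker(config=config)

    # Requires the "unstructured" dependency (enforced by @requires_dependencies).
    chunks: list[dict] = chunker.run(elements_filepath=Path("partitioned.json"))
    print(len(chunks), chunks[0]["type"] if chunks else None)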

unstructured_ingest/v2/processes/connector_registry.py

@@ -1,3 +1,4 @@
+from abc import ABC
 from dataclasses import dataclass
 from typing import Optional, Type, TypeVar
 
@@ -25,7 +26,12 @@ UploaderT = TypeVar("UploaderT", bound=Uploader)
 
 
 @dataclass
-class SourceRegistryEntry:
+class RegistryEntry(ABC):
+    pass
+
+
+@dataclass
+class SourceRegistryEntry(RegistryEntry):
     indexer: Type[IndexerT]
     downloader: Type[DownloaderT]
 
@@ -44,7 +50,7 @@ def add_source_entry(source_type: str, entry: SourceRegistryEntry):
 
 
 @dataclass
-class DestinationRegistryEntry:
+class DestinationRegistryEntry(RegistryEntry):
     uploader: Type[UploaderT]
     upload_stager: Optional[Type[UploadStagerT]] = None
 

unstructured_ingest/v2/processes/connectors/__init__.py

@@ -6,10 +6,14 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )
 
-from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE
-from .astra import astra_destination_entry
+from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
+from .astradb import astra_db_destination_entry
+from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
+from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
+from .couchbase import couchbase_destination_entry
 from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
 from .databricks_volumes import databricks_volumes_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
@@ -39,10 +43,12 @@ from .sql import sql_destination_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
-add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry)
+add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
 
 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
 
+add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
+
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
 add_destination_entry(
     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
@@ -77,3 +83,7 @@ add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
+add_destination_entry(
+    destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
+    entry=azure_cognitive_search_destination_entry,
+)
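
New destinations (Couchbase, plus Azure Cognitive Search in v2) are wired in through the same add_destination_entry calls shown above, and both SourceRegistryEntry and DestinationRegistryEntry now share the new RegistryEntry base class; importing unstructured_ingest.v2.processes.connectors is what populates the registry. The following registration sketch for a hypothetical extra connector illustrates the pattern only: the my_connector module and classes are placeholders, not part of the package, so the import will not resolve as written.

    # Registration pattern sketch; all "my_connector" names are hypothetical.
    from unstructured_ingest.v2.processes.connector_registry import (
        DestinationRegistryEntry,
        add_destination_entry,
    )
    from my_connector import (  # placeholder module, not shipped with the package
        MyConnectorConnectionConfig,
        MyConnectorUploader,
        MyConnectorUploaderConfig,
    )

    add_destination_entry(
        destination_type="my_connector",
        entry=DestinationRegistryEntry(
            connection_config=MyConnectorConnectionConfig,
            uploader=MyConnectorUploader,
            uploader_config=MyConnectorUploaderConfig,
        ),
    )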

unstructured_ingest/v2/processes/connectors/astradb.py (renamed from astra.py)

@@ -3,10 +3,10 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
-from unstructured import __name__ as integration_name
-from unstructured.__version__ import __version__ as integration_version
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest import __name__ as integration_name
+from unstructured_ingest.__version__ import __version__ as integration_version
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -28,30 +28,30 @@ from unstructured_ingest.v2.processes.connector_registry import (
 if TYPE_CHECKING:
     from astrapy.db import AstraDBCollection
 
-CONNECTOR_TYPE = "astra"
+CONNECTOR_TYPE = "astradb"
 
 
 @dataclass
-class AstraAccessConfig(AccessConfig):
-    token: str
-    api_endpoint: str
+class AstraDBAccessConfig(AccessConfig):
+    token: str = Field(description="Astra DB Token with access to the database.")
+    api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
 
 
 @dataclass
-class AstraConnectionConfig(ConnectionConfig):
+class AstraDBConnectionConfig(ConnectionConfig):
     connection_type: str = CONNECTOR_TYPE
-    access_config: AstraAccessConfig = enhanced_field(sensitive=True)
+    access_config: Secret[AstraDBAccessConfig]
 
 
 @dataclass
-class AstraUploadStagerConfig(UploadStagerConfig):
+class AstraDBUploadStagerConfig(UploadStagerConfig):
     pass
 
 
 @dataclass
-class AstraUploadStager(UploadStager):
-    upload_stager_config: AstraUploadStagerConfig = field(
-        default_factory=lambda: AstraUploadStagerConfig()
+class AstraDBUploadStager(UploadStager):
+    upload_stager_config: AstraDBUploadStagerConfig = field(
+        default_factory=lambda: AstraDBUploadStagerConfig()
     )
 
     def conform_dict(self, element_dict: dict) -> dict:
@@ -80,19 +80,28 @@ class AstraUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
-class AstraUploaderConfig(UploaderConfig):
-    collection_name: str
-    embedding_dimension: int
-    namespace: Optional[str] = None
-    requested_indexing_policy: Optional[dict[str, Any]] = None
-    batch_size: int = 20
+class AstraDBUploaderConfig(UploaderConfig):
+    collection_name: str = Field(
+        description="The name of the Astra DB collection. "
+        "Note that the collection name must only include letters, "
+        "numbers, and underscores."
+    )
+    embedding_dimension: int = Field(
+        default=384, description="The dimensionality of the embeddings"
+    )
+    namespace: Optional[str] = Field(default=None, description="The Astra DB connection namespace.")
+    requested_indexing_policy: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="The indexing policy to use for the collection.",
+        examples=['{"deny": ["metadata"]}'],
+    )
+    batch_size: int = Field(default=20, description="Number of records per batch")
 
 
 @dataclass
-class AstraUploader(Uploader):
-    connection_config: AstraConnectionConfig
-    upload_config: AstraUploaderConfig
+class AstraDBUploader(Uploader):
+    connection_config: AstraDBConnectionConfig
+    upload_config: AstraDBUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
     def precheck(self) -> None:
@@ -102,7 +111,7 @@ class AstraUploader(Uploader):
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @requires_dependencies(["astrapy"], extras="astra")
+    @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
         from astrapy.db import AstraDB
 
@@ -111,14 +120,15 @@ class AstraUploader(Uploader):
         embedding_dimension = self.upload_config.embedding_dimension
         requested_indexing_policy = self.upload_config.requested_indexing_policy
 
-        # If the user has requested an indexing policy, pass it to the AstraDB
+        # If the user has requested an indexing policy, pass it to the Astra DB
         options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
 
         # Build the Astra DB object.
         # caller_name/version for AstraDB tracking
+        access_configs = self.connection_config.access_config.get_secret_value()
         astra_db = AstraDB(
-            api_endpoint=self.connection_config.access_config.api_endpoint,
-            token=self.connection_config.access_config.token,
+            api_endpoint=access_configs.api_endpoint,
+            token=access_configs.token,
             namespace=self.upload_config.namespace,
             caller_name=integration_name,
             caller_version=integration_version,
@@ -144,17 +154,17 @@ class AstraUploader(Uploader):
             f"collection {self.upload_config.collection_name}"
         )
 
-        astra_batch_size = self.upload_config.batch_size
+        astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()
 
-        for chunk in batch_generator(elements_dict, astra_batch_size):
+        for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)
 
 
-astra_destination_entry = DestinationRegistryEntry(
-    connection_config=AstraConnectionConfig,
-    upload_stager_config=AstraUploadStagerConfig,
-    upload_stager=AstraUploadStager,
-    uploader_config=AstraUploaderConfig,
-    uploader=AstraUploader,
+astra_db_destination_entry = DestinationRegistryEntry(
+    connection_config=AstraDBConnectionConfig,
+    upload_stager_config=AstraDBUploadStagerConfig,
+    upload_stager=AstraDBUploadStager,
+    uploader_config=AstraDBUploaderConfig,
+    uploader=AstraDBUploader,
 )
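
For reference, a configuration sketch for the renamed Astra DB destination, using only the fields visible in this diff. The token, endpoint, and collection values are placeholders, and keyword construction of the config classes is assumed from their dataclass/Pydantic definitions shown above.

    # Astra DB destination configuration sketch; credential values are placeholders.
    from pydantic import Secret

    from unstructured_ingest.v2.processes.connectors.astradb import (
        AstraDBAccessConfig,
        AstraDBConnectionConfig,
        AstraDBUploader,
        AstraDBUploaderConfig,
    )

    access_config = AstraDBAccessConfig(
        token="AstraCS:placeholder-token",
        api_endpoint="https://placeholder-us-east1.apps.astra.datastax.com",
    )
    connection_config = AstraDBConnectionConfig(access_config=Secret(access_config))
    upload_config = AstraDBUploaderConfig(
        collection_name="ingest_elements",  # letters, numbers, and underscores only
        embedding_dimension=384,            # should match the embedding model's output size
        batch_size=20,
    )

    uploader = AstraDBUploader(connection_config=connection_config, upload_config=upload_config)
    uploader.precheck()  # validates the connection before any writes

Note that the astrapy dependency is now pulled in via the renamed astradb extra (previously astra), matching the updated requires_dependencies decorator.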