unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +9 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
- unstructured_ingest/v2/processes/connectors/local.py +27 -16
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/steps/download.py

```diff
@@ -9,7 +9,7 @@ from unstructured_ingest.v2.interfaces import FileData, download_responses
 from unstructured_ingest.v2.interfaces.downloader import Downloader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 DownloaderT = TypeVar("DownloaderT", bound=Downloader)
 
@@ -30,15 +30,9 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.download_config.to_dict(redact_sensitive=True))
-            if self.process.download_config
-            else None
-        )
+        config = self.process.download_config.json() if self.process.download_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -180,9 +174,17 @@ class DownloadStep(PipelineStep):
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.download_config.to_dict(), sort_keys=True
+        download_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.download_config)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config)
         )
+        hashable_dict = {
+            "download_config": download_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
```
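The rewritten get_hash() makes the step's cache key deterministic: both configs are serialized to JSON, nested under fixed keys, dumped with sort_keys=True, and the first 12 hex characters of the SHA-256 digest identify the step. A self-contained sketch of that scheme (the helper name below is illustrative, not the library's code):

```python
import hashlib
import json


def step_hash(download_config_json: str, connection_config_json: str) -> str:
    # Nest both configs under fixed keys so the combined payload is stable,
    # then sort keys at every level for a deterministic string.
    hashable_dict = {
        "download_config": json.loads(download_config_json),
        "connection_config": json.loads(connection_config_json),
    }
    hashable_string = json.dumps(hashable_dict, sort_keys=True)
    # Truncate to 12 hex chars, matching the step identifiers above.
    return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]


# Key order in the incoming JSON no longer affects the hash:
assert step_hash('{"a": 1, "b": 2}', "{}") == step_hash('{"b": 2, "a": 1}', "{}")
```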
unstructured_ingest/v2/pipeline/steps/embed.py

```diff
@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.embedder import Embedder
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "embed"
 
@@ -30,11 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
@@ -71,13 +65,13 @@ class EmbedStep(PipelineStep):
 
         self._save_output(
             output_filepath=str(output_filepath),
-            embedded_content=elements_to_dicts(embed_content_raw),
+            embedded_content=embed_content_raw,
         )
         return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
```
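The diff does not show the body of serialize_base_model_json (added in unstructured_ingest/v2/utils.py, +45 lines), so the following is only a guess at its contract, inferred from the call sites above: accept an optional pydantic model plus json.dumps-style keyword arguments and return a JSON string.

```python
import json
from typing import Any, Optional

from pydantic import BaseModel


def serialize_base_model_json(model: Optional[BaseModel] = None, **json_kwargs: Any) -> str:
    # Hypothetical reimplementation; None serializes to "null" so callers can
    # still hash steps whose optional configs are absent.
    if model is None:
        return json.dumps(None, **json_kwargs)
    # Round-trip through the model's own JSON so types like SecretStr are
    # rendered via pydantic's encoders, then re-dump with the caller's kwargs
    # (e.g. sort_keys=True, ensure_ascii=True).
    return json.dumps(json.loads(model.json()), **json_kwargs)
```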
unstructured_ingest/v2/pipeline/steps/filter.py

```diff
@@ -5,7 +5,6 @@ from typing import Callable, Optional
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.filter import Filterer
 
 STEP_ID = "filter"
@@ -17,11 +16,7 @@ class FilterStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
```
unstructured_ingest/v2/pipeline/steps/index.py

```diff
@@ -6,7 +6,7 @@ from typing import Generator, Optional, TypeVar
 from unstructured_ingest.v2.interfaces.indexer import Indexer
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 IndexerT = TypeVar("IndexerT", bound=Indexer)
 
@@ -22,15 +22,9 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.index_config.to_dict(redact_sensitive=True))
-            if self.process.index_config
-            else None
-        )
+        config = self.process.index_config.json() if self.process.index_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -55,7 +49,17 @@ class IndexStep(PipelineStep):
             continue
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(self.process.index_config.to_dict(), sort_keys=True)
+        index_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.index_config, sort_keys=True)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config, sort_keys=True)
+        )
+        hashable_dict = {
+            "index_config": index_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
```
unstructured_ingest/v2/pipeline/steps/partition.py

```diff
@@ -8,8 +8,8 @@ from typing import Callable, Optional, TypedDict
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.partitioner import Partitioner
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "partition"
 
@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"
 
     def __post_init__(self):
-        config = sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+        config = self.process.config.json()
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
@@ -56,7 +56,7 @@ class PartitionStep(PipelineStep):
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata}
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:
@@ -70,8 +70,8 @@ class PartitionStep(PipelineStep):
         return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
        )
         if extras:
             hashable_string += "".join(extras)
```
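The fn_kwargs change passes file_data.metadata.to_dict() instead of the metadata object itself, presumably so the partitioner receives plain, serializable values. A hypothetical sketch of the shape involved (this dataclass is illustrative, not the library's exact metadata type):

```python
from dataclasses import asdict, dataclass, field
from typing import Optional


@dataclass
class ExampleFileMetadata:
    url: Optional[str] = None
    date_modified: Optional[str] = None
    record_locator: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        return asdict(self)


# Plain dicts survive JSON round-trips and process boundaries; the raw
# dataclass instance would not.
fn_kwargs = {
    "filename": "files/report.pdf",
    "metadata": ExampleFileMetadata(url="s3://bucket/report.pdf").to_dict(),
}
```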
unstructured_ingest/v2/pipeline/steps/stage.py

```diff
@@ -1,6 +1,5 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
@@ -9,7 +8,7 @@ from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "upload_stage"
 
@@ -29,9 +28,7 @@ class UploadStageStep(PipelineStep):
 
     def __post_init__(self):
         config = (
-            sterilize_dict(self.process.upload_stager_config.to_dict(redact_sensitive=True))
-            if self.process.upload_stager_config
-            else None
+            self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"Created {self.identifier} with configs: {config}")
@@ -56,8 +53,8 @@ class UploadStageStep(PipelineStep):
         return UploadStageStepResponse(file_data_path=file_data_path, path=str(staged_output_path))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.upload_stager_config.to_dict(), sort_keys=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.upload_stager_config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
```
unstructured_ingest/v2/pipeline/steps/uncompress.py

```diff
@@ -5,7 +5,6 @@ from typing import Callable, TypedDict
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.uncompress import Uncompressor
 
 STEP_ID = "uncompress"
@@ -21,11 +20,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:
```
unstructured_ingest/v2/pipeline/steps/upload.py

```diff
@@ -7,7 +7,6 @@ from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.uploader import UploadContent, Uploader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 
 STEP_ID = "upload"
 
@@ -26,15 +25,9 @@ class UploadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.upload_config.to_dict(redact_sensitive=True))
-            if self.process.upload_config
-            else None
-        )
+        config = self.process.upload_config.json() if self.process.upload_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
```
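Every step above swaps sterilize_dict(config.to_dict(redact_sensitive=True)) for pydantic's config.json() when logging. Redaction still holds because sensitive fields are now declared as SecretStr, which pydantic masks during serialization. A minimal demonstration (the config class here is hypothetical, using the pydantic v1-style .json() seen in the diff):

```python
from typing import Optional

from pydantic import BaseModel, SecretStr


class ExampleUploadConfig(BaseModel):
    endpoint: str = "https://example.invalid"
    api_key: Optional[SecretStr] = None


cfg = ExampleUploadConfig(api_key="super-secret")
print(cfg.json())                      # {"endpoint": ..., "api_key": "**********"}
print(cfg.api_key.get_secret_value())  # "super-secret" -- only on explicit request
```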
unstructured_ingest/v2/processes/__init__.py

```diff
@@ -0,0 +1,18 @@
+from .chunker import Chunker, ChunkerConfig
+from .embedder import Embedder, EmbedderConfig
+from .filter import Filterer, FiltererConfig
+from .partitioner import Partitioner, PartitionerConfig
+from .uncompress import UncompressConfig, Uncompressor
+
+__all__ = [
+    "Chunker",
+    "ChunkerConfig",
+    "Embedder",
+    "EmbedderConfig",
+    "Filterer",
+    "FiltererConfig",
+    "Partitioner",
+    "PartitionerConfig",
+    "Uncompressor",
+    "UncompressConfig",
+]
```
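The new package __init__ lets callers import the v2 process classes from one place. A sketch of the resulting import style (the constructor call is an assumption based on the self.config access in chunker.py below):

```python
from unstructured_ingest.v2.processes import Chunker, ChunkerConfig

# Assumption: Chunker accepts its config as a constructor argument, matching
# the dataclass style used elsewhere in this release.
chunker = Chunker(config=ChunkerConfig(chunking_strategy="by_title"))
```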
unstructured_ingest/v2/processes/chunker.py

```diff
@@ -3,29 +3,70 @@ from dataclasses import dataclass, fields
 from pathlib import Path
 from typing import Any, Optional
 
-from unstructured.chunking import dispatch
-from unstructured.documents.elements import Element, assign_and_map_hash_ids
-from unstructured.staging.base import dict_to_elements, elements_from_json
+from pydantic import BaseModel, Field, SecretStr
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
+from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
 
+CHUNK_MAX_CHARS_DEFAULT: int = 500
+CHUNK_MULTI_PAGE_DEFAULT: bool = True
 
-@dataclass
-class ChunkerConfig(EnhancedDataClassJsonMixin):
-    chunking_strategy: Optional[str] = None
-    chunking_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
-    chunk_by_api: bool = False
-    chunk_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
 
-    chunk_combine_text_under_n_chars: Optional[int] = None
-    chunk_include_orig_elements: Optional[bool] = None
-    chunk_max_characters: int = 500
-    chunk_multipage_sections: bool = True
-    chunk_new_after_n_chars: Optional[int] = None
-    chunk_overlap: Optional[int] = None
-    chunk_overlap_all: Optional[bool] = None
+class ChunkerConfig(BaseModel):
+    chunking_strategy: Optional[str] = Field(
+        default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
+    )
+    chunking_endpoint: Optional[str] = Field(
+        default="https://api.unstructured.io/general/v0/general",
+        description="If chunking via api, use the following host.",
+    )
+    chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
+    chunk_api_key: Optional[SecretStr] = Field(
+        default=None, description="API Key for chunking endpoint."
+    )
+
+    chunk_combine_text_under_n_chars: Optional[int] = Field(
+        default=None,
+        description="Combine consecutive chunks when the first does not exceed this length and"
+        " the second will fit without exceeding the hard-maximum length. Only"
+        " operative for 'by_title' chunking-strategy.",
+    )
+    chunk_include_orig_elements: Optional[bool] = Field(
+        default=None,
+        description="When chunking, add the original elements consolidated to form each chunk to"
+        " `.metadata.orig_elements` on that chunk.",
+    )
+    chunk_max_characters: int = Field(
+        default=CHUNK_MAX_CHARS_DEFAULT,
+        description="Hard maximum chunk length. No chunk will exceed this length. An oversized"
+        " element will be divided by text-splitting to fit this window.",
+    )
+    chunk_multipage_sections: bool = Field(
+        default=CHUNK_MULTI_PAGE_DEFAULT,
+        description="Ignore page boundaries when chunking such that elements from two different"
+        " pages can appear in the same chunk. Only operative for 'by_title'"
+        " chunking-strategy.",
+    )
+    chunk_new_after_n_chars: Optional[int] = Field(
+        default=None,
+        description="Soft-maximum chunk length. Another element will not be added to a chunk of"
+        " this length even when it would fit without exceeding the hard-maximum"
+        " length.",
+    )
+    chunk_overlap: Optional[int] = Field(
+        default=None,
+        description="Prefix chunk text with last overlap=N characters of prior chunk. Only"
+        " applies to oversized chunks divided by text-splitting. To apply overlap to"
+        " non-oversized chunks use the --overlap-all option.",
+    )
+    chunk_overlap_all: Optional[bool] = Field(
+        default=None,
+        description="Apply overlap to chunks formed from whole elements as well as those formed"
+        " by text-splitting oversized elements. Overlap length is take from --overlap"
+        " option value.",
+    )
 
     def to_chunking_kwargs(self) -> dict[str, Any]:
         return {
@@ -47,10 +88,14 @@ class Chunker(BaseProcess, ABC):
     def is_async(self) -> bool:
         return self.config.chunk_by_api
 
-    def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
+    @requires_dependencies(dependencies=["unstructured"])
+    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
+        from unstructured.chunking import dispatch
+        from unstructured.staging.base import elements_from_json
+
         elements = elements_from_json(filename=str(elements_filepath))
         if not elements:
-            return elements
+            return [e.to_dict() for e in elements]
         local_chunking_strategies = ("basic", "by_title")
         if self.config.chunking_strategy not in local_chunking_strategies:
             logger.warning(
@@ -58,17 +103,19 @@ class Chunker(BaseProcess, ABC):
                     self.config.chunking_strategy, ", ".join(local_chunking_strategies)
                 )
             )
-            return elements
+            return [e.to_dict() for e in elements]
         chunked_elements = dispatch.chunk(elements=elements, **self.config.to_chunking_kwargs())
-        assign_and_map_hash_ids(chunked_elements)
-        return chunked_elements
+        chunked_elements_dicts = [e.to_dict() for e in chunked_elements]
+        chunked_elements_dicts = assign_and_map_hash_ids(elements=chunked_elements_dicts)
+        return chunked_elements_dicts
 
-    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
+    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
+    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         from unstructured_client import UnstructuredClient
         from unstructured_client.models.shared import Files, PartitionParameters
 
         client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key,
+            api_key_auth=self.config.chunk_api_key.get_secret_value(),
             server_url=self.config.chunking_endpoint,
         )
         partition_request = self.config.to_chunking_kwargs()
@@ -89,9 +136,8 @@ class Chunker(BaseProcess, ABC):
             file_name=str(elements_filepath.resolve()),
         )
         filtered_partition_request["files"] = files
-
+        partition_params = PartitionParameters(**filtered_partition_request)
         resp = client.general.partition(partition_params)
-
-        elements = resp.elements
-        assign_and_map_hash_ids(elements)
+        elements = resp.elements or []
+        elements = assign_and_map_hash_ids(elements=elements)
         return elements
```
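A usage sketch for the pydantic ChunkerConfig above. run() only chunks locally for "basic" and "by_title" (anything else logs a warning and passes elements through), while chunk_by_api=True routes work to run_async() and the hosted endpoint; chunk_api_key is now a SecretStr, so it is masked in .json() output and must be unwrapped with get_secret_value(), exactly as run_async() does:

```python
from unstructured_ingest.v2.processes.chunker import ChunkerConfig

# Local path: "by_title" is one of the two strategies run() handles in-process.
local_cfg = ChunkerConfig(chunking_strategy="by_title", chunk_max_characters=800)

# API path: chunk_by_api=True makes is_async() (and run_async) take over.
api_cfg = ChunkerConfig(
    chunking_strategy="by_title",
    chunk_by_api=True,
    chunk_api_key="my-api-key",  # plain str input is coerced to SecretStr
)
print(api_cfg.json())  # chunk_api_key appears as "**********" in logs
print(api_cfg.chunk_api_key.get_secret_value())  # real key, as run_async() needs
```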
unstructured_ingest/v2/processes/connector_registry.py

```diff
@@ -1,3 +1,4 @@
+from abc import ABC
 from dataclasses import dataclass
 from typing import Optional, Type, TypeVar
 
@@ -25,7 +26,12 @@ UploaderT = TypeVar("UploaderT", bound=Uploader)
 
 
 @dataclass
-class SourceRegistryEntry:
+class RegistryEntry(ABC):
+    pass
+
+
+@dataclass
+class SourceRegistryEntry(RegistryEntry):
     indexer: Type[IndexerT]
     downloader: Type[DownloaderT]
 
@@ -44,7 +50,7 @@ def add_source_entry(source_type: str, entry: SourceRegistryEntry):
 
 
 @dataclass
-class DestinationRegistryEntry:
+class DestinationRegistryEntry(RegistryEntry):
     uploader: Type[UploaderT]
     upload_stager: Optional[Type[UploadStagerT]] = None
 
```
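The new RegistryEntry ABC gives source and destination entries a common base, so generic code can handle either kind uniformly. A self-contained sketch mirroring the shapes in the diff (the stub classes stand in for real indexer/downloader types):

```python
from abc import ABC
from dataclasses import dataclass
from typing import Optional, Type


@dataclass
class RegistryEntry(ABC):
    pass


@dataclass
class SourceRegistryEntry(RegistryEntry):
    indexer: Type
    downloader: Type


@dataclass
class DestinationRegistryEntry(RegistryEntry):
    uploader: Type
    upload_stager: Optional[Type] = None


class StubIndexer: ...
class StubDownloader: ...


entry = SourceRegistryEntry(indexer=StubIndexer, downloader=StubDownloader)
assert isinstance(entry, RegistryEntry)  # both entry kinds now share one base
```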
unstructured_ingest/v2/processes/connectors/__init__.py

```diff
@@ -6,16 +6,22 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )
 
-from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE
-from .astra import astra_destination_entry
+from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
+from .astradb import astra_db_destination_entry
+from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
+from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
+from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
 from .databricks_volumes import databricks_volumes_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
+from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
+from .kdbai import kdbai_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
 from .local import local_destination_entry, local_source_entry
 from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
@@ -39,10 +45,13 @@ from .sql import sql_destination_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
-add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry)
+add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
 
 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
 
+add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
+add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
+
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
 add_destination_entry(
     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
@@ -77,3 +86,9 @@ add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
+add_destination_entry(
+    destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
+    entry=azure_cognitive_search_destination_entry,
+)
+
+add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
```
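All of these add_source_entry/add_destination_entry calls execute at import time, so importing the connectors package is what makes the new couchbase, kdbai, astradb, and azure_cognitive_search entries visible to the pipeline:

```python
# Side-effect import: executing this module runs every add_*_entry call above,
# populating the registry before any connector is looked up by name.
import unstructured_ingest.v2.processes.connectors  # noqa: F401
```

Anything that resolves connectors by type string afterwards sees the new entries; no other wiring is required.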