ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/document_search/__init__.py +3 -0
- ragbits/document_search/_main.py +273 -0
- ragbits/document_search/cli.py +109 -0
- ragbits/document_search/documents/__init__.py +0 -0
- ragbits/document_search/documents/document.py +203 -0
- ragbits/document_search/documents/element.py +208 -0
- ragbits/document_search/ingestion/__init__.py +0 -0
- ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
- ragbits/document_search/ingestion/enrichers/base.py +64 -0
- ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
- ragbits/document_search/ingestion/enrichers/image.py +107 -0
- ragbits/document_search/ingestion/enrichers/router.py +86 -0
- ragbits/document_search/ingestion/parsers/__init__.py +9 -0
- ragbits/document_search/ingestion/parsers/base.py +97 -0
- ragbits/document_search/ingestion/parsers/docling.py +178 -0
- ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
- ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
- ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
- ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
- ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
- ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
- ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
- ragbits/document_search/ingestion/parsers/router.py +90 -0
- ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
- ragbits/document_search/ingestion/strategies/__init__.py +6 -0
- ragbits/document_search/ingestion/strategies/base.py +290 -0
- ragbits/document_search/ingestion/strategies/batched.py +261 -0
- ragbits/document_search/ingestion/strategies/ray.py +138 -0
- ragbits/document_search/ingestion/strategies/sequential.py +23 -0
- ragbits/document_search/py.typed +0 -0
- ragbits/document_search/retrieval/__init__.py +0 -0
- ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
- ragbits/document_search/retrieval/rephrasers/base.py +39 -0
- ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
- ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
- ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
- ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
- ragbits/document_search/retrieval/rerankers/base.py +56 -0
- ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
- ragbits/document_search/retrieval/rerankers/llm.py +177 -0
- ragbits/document_search/retrieval/rerankers/noop.py +34 -0
- ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
- ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
ragbits/document_search/ingestion/strategies/batched.py

@@ -0,0 +1,261 @@
import asyncio
from collections.abc import Iterable
from dataclasses import dataclass

from ragbits.core.sources.base import Source
from ragbits.core.utils.helpers import batched
from ragbits.core.vector_stores.base import VectorStore
from ragbits.document_search.documents.document import Document, DocumentMeta
from ragbits.document_search.documents.element import Element
from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
from ragbits.document_search.ingestion.strategies.base import (
    IngestDocumentResult,
    IngestError,
    IngestExecutionResult,
    IngestStrategy,
)


@dataclass
class IngestTaskResult:
    """
    Represents the result of the document batch ingest task.
    """

    document_uri: str
    elements: list[Element]


class BatchedIngestStrategy(IngestStrategy):
    """
    Ingest strategy that processes documents in batches.
    """

    def __init__(
        self,
        batch_size: int | None = None,
        enrich_batch_size: int | None = None,
        index_batch_size: int | None = None,
        num_retries: int = 3,
        backoff_multiplier: int = 1,
        backoff_max: int = 60,
    ) -> None:
        """
        Initialize the BatchedIngestStrategy instance.

        Args:
            batch_size: The batch size for parsing documents.
                Describes the maximum number of documents to parse at once. If None, all documents are parsed at once.
            enrich_batch_size: The batch size for enriching elements.
                Describes the maximum number of document elements to enrich at once.
                If None, all elements are enriched at once.
            index_batch_size: The batch size for indexing elements.
                Describes the maximum number of document elements to index at once.
                If None, all elements are indexed at once.
            num_retries: The number of retries per document ingest task error.
            backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
            backoff_max: The maximum allowed delay (in seconds) between retries.
        """
        super().__init__(num_retries=num_retries, backoff_multiplier=backoff_multiplier, backoff_max=backoff_max)
        self.batch_size = batch_size
        self.enrich_batch_size = enrich_batch_size
        self.index_batch_size = index_batch_size

    async def __call__(
        self,
        documents: Iterable[DocumentMeta | Document | Source],
        vector_store: VectorStore,
        parser_router: DocumentParserRouter,
        enricher_router: ElementEnricherRouter,
    ) -> IngestExecutionResult:
        """
        Ingest documents sequentially in batches.

        Args:
            documents: The documents to ingest.
            vector_store: The vector store to store document chunks.
            parser_router: The document parser router to use.
            enricher_router: The intermediate element enricher router to use.

        Returns:
            The ingest execution result.
        """
        results = IngestExecutionResult()

        for documents_batch in batched(documents, self.batch_size):
            # Parse documents
            parse_results = await self._parse_batch(documents_batch, parser_router)

            # Split documents into successful and failed
            successfully_parsed = [result for result in parse_results if isinstance(result, IngestTaskResult)]
            failed_parsed = [result for result in parse_results if isinstance(result, IngestDocumentResult)]

            # Further split successfully parsed documents into those that need enrichment and those that are ready
            to_enrich = [
                result
                for result in successfully_parsed
                if any(type(element) in enricher_router for element in result.elements)
            ]
            ready_parsed = [
                result
                for result in successfully_parsed
                if not any(type(element) in enricher_router for element in result.elements)
            ]

            # Enrich documents
            enrich_results = await self._enrich_batch(to_enrich, enricher_router)

            # Split enriched documents into successful and failed
            successfully_enriched = [result for result in enrich_results if isinstance(result, IngestTaskResult)]
            failed_enriched = [result for result in enrich_results if isinstance(result, IngestDocumentResult)]

            # Combine ready documents with successfully enriched documents for indexing
            to_index = ready_parsed + successfully_enriched

            # Index the combined documents
            index_results = await self._index_batch(to_index, vector_store)

            # Split indexed documents into successful and failed
            successfully_indexed = [result for result in index_results if not result.error]
            failed_indexed = [result for result in index_results if result.error]

            # Combine all failed documents
            all_failed = failed_parsed + failed_enriched + failed_indexed

            # Update the final result
            results.successful.extend(successfully_indexed)
            results.failed.extend(all_failed)

        return results

    async def _parse_batch(
        self,
        batch: list[DocumentMeta | Document | Source],
        parser_router: DocumentParserRouter,
    ) -> list[IngestTaskResult | IngestDocumentResult]:
        """
        Parse a batch of documents.

        Args:
            batch: The documents to parse.
            parser_router: The document parser router to use.

        Returns:
            The task results.
        """
        uris = [document.metadata.id if isinstance(document, Document) else document.id for document in batch]
        responses = await asyncio.gather(
            *[
                self._call_with_error_handling(
                    self._parse_document,
                    document=document,
                    parser_router=parser_router,
                )
                for document in batch
            ],
            return_exceptions=True,
        )

        results: list[IngestTaskResult | IngestDocumentResult] = []
        for uri, response in zip(uris, responses, strict=True):
            if isinstance(response, BaseException):
                # Handle only standard exceptions, not BaseExceptions like SystemExit, KeyboardInterrupt, etc.
                if isinstance(response, Exception):
                    results.append(
                        IngestDocumentResult(
                            document_uri=uri,
                            error=IngestError.from_exception(response),
                        )
                    )
                else:
                    raise response
            else:
                results.append(
                    IngestTaskResult(
                        document_uri=uri,
                        elements=response,
                    )
                )

        return results

    async def _enrich_batch(
        self,
        batch: list[IngestTaskResult],
        enricher_router: ElementEnricherRouter,
    ) -> list[IngestTaskResult | IngestDocumentResult]:
        """
        Enrich a batch of documents.

        Args:
            batch: The documents to enrich.
            enricher_router: The intermediate element enricher router to use.

        Returns:
            The task results.
        """

        async def _enrich_document(result: IngestTaskResult) -> IngestTaskResult | IngestDocumentResult:
            try:
                enriched_elements = [
                    element
                    for elements_batch in batched(result.elements, self.enrich_batch_size)
                    for element in await self._call_with_error_handling(
                        self._enrich_elements,
                        elements=elements_batch,
                        enricher_router=enricher_router,
                    )
                ]
                return IngestTaskResult(
                    document_uri=result.document_uri,
                    elements=enriched_elements,
                )
            except Exception as exc:
                return IngestDocumentResult(
                    document_uri=result.document_uri,
                    error=IngestError.from_exception(exc),
                )

        return await asyncio.gather(*[_enrich_document(result) for result in batch])

    async def _index_batch(
        self,
        batch: list[IngestTaskResult],
        vector_store: VectorStore,
    ) -> list[IngestDocumentResult]:
        """
        Index a batch of documents.

        Args:
            batch: The documents to index.
            vector_store: The vector store to store document chunks.

        Returns:
            The task results.
        """

        async def _index_document(result: IngestTaskResult) -> IngestDocumentResult:
            try:
                await self._call_with_error_handling(
                    self._remove_elements,
                    document_ids=[result.document_uri],
                    vector_store=vector_store,
                )
                for elements_batch in batched(result.elements, self.index_batch_size):
                    await self._call_with_error_handling(
                        self._insert_elements,
                        elements=elements_batch,
                        vector_store=vector_store,
                    )
                return IngestDocumentResult(
                    document_uri=result.document_uri,
                    num_elements=len(result.elements),
                )
            except Exception as exc:
                return IngestDocumentResult(
                    document_uri=result.document_uri,
                    error=IngestError.from_exception(exc),
                )

        return await asyncio.gather(*[_index_document(result) for result in batch])
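A minimal usage sketch for the strategy above, based only on the signatures shown in this file. The my_documents, my_vector_store, my_parser_router, and my_enricher_router names are placeholders for objects constructed elsewhere in a ragbits application, not part of the package:

import asyncio

from ragbits.document_search.ingestion.strategies.batched import BatchedIngestStrategy


async def main() -> None:
    # Batch sizes are illustrative; None would process everything at once.
    strategy = BatchedIngestStrategy(
        batch_size=10,         # parse up to 10 documents at a time
        enrich_batch_size=32,  # enrich up to 32 elements at a time
        index_batch_size=64,   # index up to 64 elements at a time
    )
    # The strategy is invoked as a callable with the routers and the store.
    result = await strategy(
        documents=my_documents,
        vector_store=my_vector_store,
        parser_router=my_parser_router,
        enricher_router=my_enricher_router,
    )
    print(f"ingested: {len(result.successful)}, failed: {len(result.failed)}")


asyncio.run(main())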
ragbits/document_search/ingestion/strategies/ray.py

@@ -0,0 +1,138 @@
import asyncio
from collections.abc import Iterable

from ragbits.core.sources.base import Source
from ragbits.core.utils.decorators import requires_dependencies
from ragbits.core.vector_stores.base import VectorStore
from ragbits.document_search.documents.document import Document, DocumentMeta
from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
from ragbits.document_search.ingestion.strategies.base import (
    IngestDocumentResult,
    IngestExecutionResult,
)
from ragbits.document_search.ingestion.strategies.batched import BatchedIngestStrategy, IngestTaskResult


class RayDistributedIngestStrategy(BatchedIngestStrategy):
    """
    Ingest strategy that processes documents on a cluster, using Ray.
    """

    def __init__(
        self,
        batch_size: int = 1,
        enrich_batch_size: int | None = None,
        index_batch_size: int | None = None,
        parse_memory: float | None = None,
        processing_memory: float | None = None,
        num_retries: int = 3,
        backoff_multiplier: int = 1,
        backoff_max: int = 60,
    ) -> None:
        """
        Initialize the RayDistributedIngestStrategy instance.

        Args:
            batch_size: The batch size for parsing documents.
            enrich_batch_size: The batch size for enriching elements.
                Describes the maximum number of document elements to enrich at once.
                If None, all elements are enriched at once.
            index_batch_size: The batch size for indexing elements.
                Describes the maximum number of document elements to index at once.
                If None, all elements are indexed at once.
            parse_memory: The heap memory in bytes to reserve for each parallel parsing task.
            processing_memory: The heap memory in bytes to reserve for each parallel element-processing task.
            num_retries: The number of retries per document ingest task error.
            backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
            backoff_max: The maximum allowed delay (in seconds) between retries.
        """
        super().__init__(
            batch_size=batch_size,
            enrich_batch_size=enrich_batch_size,
            index_batch_size=index_batch_size,
            num_retries=num_retries,
            backoff_multiplier=backoff_multiplier,
            backoff_max=backoff_max,
        )
        self.parse_memory = parse_memory
        self.processing_memory = processing_memory

    @requires_dependencies(["ray.data"], "ray")
    async def __call__(
        self,
        documents: Iterable[DocumentMeta | Document | Source],
        vector_store: VectorStore,
        parser_router: DocumentParserRouter,
        enricher_router: ElementEnricherRouter,
    ) -> IngestExecutionResult:
        """
        Ingest documents in parallel in batches.

        Args:
            documents: The documents to ingest.
            vector_store: The vector store to store document chunks.
            parser_router: The document parser router to use.
            enricher_router: The intermediate element enricher router to use.

        Returns:
            The ingest execution result.
        """
        import ray

        # Parse documents
        parse_results = ray.data.from_items(list(documents)).map_batches(
            fn=lambda batch: {"results": asyncio.run(self._parse_batch(batch["item"], parser_router))},
            batch_size=self.batch_size,
            num_cpus=1,
            memory=self.parse_memory,
            zero_copy_batch=True,
        )

        # Split documents into successful and failed
        successfully_parsed = parse_results.filter(lambda data: isinstance(data["results"], IngestTaskResult))
        failed_parsed = parse_results.filter(lambda data: isinstance(data["results"], IngestDocumentResult))

        # Further split successfully parsed documents into those that need enrichment and those that are ready
        to_enrich = successfully_parsed.filter(
            lambda data: any(type(element) in enricher_router for element in data["results"].elements)
        )
        ready_parsed = successfully_parsed.filter(
            lambda data: not any(type(element) in enricher_router for element in data["results"].elements)
        )

        # Enrich documents
        enrich_results = to_enrich.map_batches(
            fn=lambda batch: {"results": asyncio.run(self._enrich_batch(batch["results"], enricher_router))},
            batch_size=self.batch_size,
            num_cpus=0,
            memory=self.processing_memory,
        )

        # Split enriched documents into successful and failed
        successfully_enriched = enrich_results.filter(lambda data: isinstance(data["results"], IngestTaskResult))
        failed_enriched = enrich_results.filter(lambda data: isinstance(data["results"], IngestDocumentResult))

        # Combine ready documents with successfully enriched documents for indexing
        to_index = ready_parsed.union(successfully_enriched)

        # Index the combined documents
        index_results = to_index.map_batches(
            fn=lambda batch: {"results": asyncio.run(self._index_batch(batch["results"], vector_store))},
            batch_size=self.batch_size,
            num_cpus=0,
            memory=self.processing_memory,
        )

        # Split indexed documents into successful and failed
        successfully_indexed = index_results.filter(lambda data: not data["results"].error)
        failed_indexed = index_results.filter(lambda data: data["results"].error)

        # Combine all failed documents
        all_failed = failed_parsed.union(failed_enriched, failed_indexed)

        # Return the final result
        return IngestExecutionResult(
            successful=[data["results"] for data in successfully_indexed.take_all()],
            failed=[data["results"] for data in all_failed.take_all()],
        )
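The Ray variant is a drop-in replacement for the batched strategy; ray itself is imported lazily inside __call__, guarded by requires_dependencies. A configuration sketch, with illustrative memory values and the same my_* placeholders as above:

from ragbits.document_search.ingestion.strategies.ray import RayDistributedIngestStrategy

strategy = RayDistributedIngestStrategy(
    batch_size=1,                   # one document per Ray parsing task
    parse_memory=4 * 1024**3,       # reserve ~4 GiB of heap per parsing task
    processing_memory=1 * 1024**3,  # reserve ~1 GiB per enrich/index task
)
# Invoked exactly like BatchedIngestStrategy:
# result = await strategy(my_documents, my_vector_store, my_parser_router, my_enricher_router)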
ragbits/document_search/ingestion/strategies/sequential.py

@@ -0,0 +1,23 @@
from ragbits.document_search.ingestion.strategies.batched import BatchedIngestStrategy


class SequentialIngestStrategy(BatchedIngestStrategy):
    """
    Ingest strategy that processes documents in sequence, one at a time.
    """

    def __init__(self, num_retries: int = 3, backoff_multiplier: int = 1, backoff_max: int = 60) -> None:
        """
        Initialize the SequentialIngestStrategy instance.

        Args:
            num_retries: The number of retries per document ingest task error.
            backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
            backoff_max: The maximum allowed delay (in seconds) between retries.
        """
        super().__init__(
            batch_size=1,
            num_retries=num_retries,
            backoff_multiplier=backoff_multiplier,
            backoff_max=backoff_max,
        )
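All three strategies forward the same retry parameters to the base IngestStrategy, which lives in strategies/base.py and is not shown in this diff. The sketch below only illustrates the delay schedule the docstrings describe (exponential backoff scaled by backoff_multiplier and capped at backoff_max); the exact formula used by ragbits may differ:

def backoff_delay(attempt: int, backoff_multiplier: int = 1, backoff_max: int = 60) -> float:
    # Delay doubles with each attempt, scaled by the multiplier, capped at the max.
    return min(backoff_multiplier * 2**attempt, backoff_max)

# With the defaults (multiplier=1, max=60), successive retries wait:
print([backoff_delay(attempt) for attempt in range(7)])  # [1, 2, 4, 8, 16, 32, 60]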
File without changes
File without changes
ragbits/document_search/retrieval/rephrasers/__init__.py

@@ -0,0 +1,18 @@
from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptions
from ragbits.document_search.retrieval.rephrasers.llm import (
    LLMQueryRephraser,
    LLMQueryRephraserOptions,
    LLMQueryRephraserPrompt,
    LLMQueryRephraserPromptInput,
)
from ragbits.document_search.retrieval.rephrasers.noop import NoopQueryRephraser

__all__ = [
    "LLMQueryRephraser",
    "LLMQueryRephraserOptions",
    "LLMQueryRephraserPrompt",
    "LLMQueryRephraserPromptInput",
    "NoopQueryRephraser",
    "QueryRephraser",
    "QueryRephraserOptions",
]
ragbits/document_search/retrieval/rephrasers/base.py

@@ -0,0 +1,39 @@
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import ClassVar, TypeVar

from ragbits.core.options import Options
from ragbits.core.utils.config_handling import ConfigurableComponent
from ragbits.document_search.retrieval import rephrasers


class QueryRephraserOptions(Options):
    """
    Object representing the options for the rephraser.
    """


QueryRephraserOptionsT = TypeVar("QueryRephraserOptionsT", bound=QueryRephraserOptions)


class QueryRephraser(ConfigurableComponent[QueryRephraserOptionsT], ABC):
    """
    Rephrases a query. Can provide multiple rephrased queries from one sentence / question.
    """

    options_cls: type[QueryRephraserOptionsT]
    default_module: ClassVar = rephrasers
    configuration_key: ClassVar = "rephraser"

    @abstractmethod
    async def rephrase(self, query: str, options: QueryRephraserOptionsT | None = None) -> Iterable[str]:
        """
        Rephrase a query.

        Args:
            query: The query to rephrase.
            options: The options for the rephraser.

        Returns:
            The rephrased queries.
        """
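A minimal sketch of implementing the abstract interface above. The class name and its naive lowercasing behavior are invented for illustration; only QueryRephraser and QueryRephraserOptions come from the package:

from collections.abc import Iterable

from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptions


class LowercaseQueryRephraser(QueryRephraser[QueryRephraserOptions]):
    """Toy rephraser: returns the query lowercased, standing in for real rephrasing logic."""

    options_cls: type[QueryRephraserOptions] = QueryRephraserOptions

    async def rephrase(self, query: str, options: QueryRephraserOptions | None = None) -> Iterable[str]:
        # A real implementation could return several reformulations of the query.
        return [query.lower()]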
ragbits/document_search/retrieval/rephrasers/llm.py

@@ -0,0 +1,141 @@
from collections.abc import Iterable
from typing import Generic

from pydantic import BaseModel
from typing_extensions import Self

from ragbits.core.audit.traces import traceable
from ragbits.core.llms.base import LLM, LLMClientOptionsT
from ragbits.core.prompt import Prompt
from ragbits.core.types import NOT_GIVEN, NotGiven
from ragbits.core.utils.config_handling import ObjectConstructionConfig, import_by_path
from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptions


class LLMQueryRephraserPromptInput(BaseModel):
    """
    Input data for the query rephraser prompt.
    """

    query: str
    n: int | None = None


class LLMQueryRephraserPrompt(Prompt[LLMQueryRephraserPromptInput, list]):
    """
    Prompt for generating a rephrased user query.
    """

    system_prompt = """
        You are an expert in query rephrasing and clarity improvement.
        {%- if n and n > 1 %}
        Your task is to generate {{ n }} different versions of the given user query to retrieve relevant documents
        from a vector database. They can be phrased as statements, as they will be used as a search query.
        By generating multiple perspectives on the user query, your goal is to help the user overcome some of the
        limitations of the distance-based similarity search.
        Alternative queries should only contain information present in the original query. Do not include anything
        in the alternative query that you have not seen in the original version.
        It is VERY important you DO NOT ADD any comments or notes. Return ONLY alternative queries.
        Provide these alternative queries separated by newlines. DO NOT ADD any enumeration.
        {%- else %}
        Your task is to return a single paraphrased version of a user's query,
        correcting any typos, handling abbreviations and improving clarity.
        Focus on making the query more precise and readable while keeping its original intent.
        Just return the rephrased query. No additional explanations are needed.
        {%- endif %}
    """
    user_prompt = "Query: {{ query }}"

    @staticmethod
    def _response_parser(value: str) -> list[str]:
        return [stripped_line for line in value.strip().split("\n") if (stripped_line := line.strip())]

    response_parser = _response_parser


class LLMQueryRephraserOptions(QueryRephraserOptions, Generic[LLMClientOptionsT]):
    """
    Object representing the options for the LLM query rephraser.

    Attributes:
        n: The number of rephrasings to generate. Any number below 2 will generate only one rephrasing.
        llm_options: The options for the LLM.
    """

    n: int | None | NotGiven = NOT_GIVEN
    llm_options: LLMClientOptionsT | None | NotGiven = NOT_GIVEN


class LLMQueryRephraser(QueryRephraser[LLMQueryRephraserOptions[LLMClientOptionsT]]):
    """
    A rephraser class that uses an LLM to rephrase queries.
    """

    options_cls: type[LLMQueryRephraserOptions] = LLMQueryRephraserOptions

    def __init__(
        self,
        llm: LLM[LLMClientOptionsT],
        prompt: type[Prompt[LLMQueryRephraserPromptInput, list[str]]] | None = None,
        default_options: LLMQueryRephraserOptions[LLMClientOptionsT] | None = None,
    ) -> None:
        """
        Initialize the LLMQueryRephraser with an LLM.

        Args:
            llm: An LLM instance to handle query rephrasing.
            prompt: The prompt to use for rephrasing queries.
            default_options: The default options for the rephraser.
        """
        super().__init__(default_options=default_options)
        self._llm = llm
        self._prompt = prompt or LLMQueryRephraserPrompt

    @traceable
    async def rephrase(
        self,
        query: str,
        options: LLMQueryRephraserOptions[LLMClientOptionsT] | None = None,
    ) -> Iterable[str]:
        """
        Rephrase a given query using the LLM.

        Args:
            query: The query to be rephrased.
            options: The options for the rephraser.

        Returns:
            A list containing the rephrased queries.

        Raises:
            LLMConnectionError: If there is a connection error with the LLM API.
            LLMStatusError: If the LLM API returns an error status code.
            LLMResponseError: If the LLM API response is invalid.
        """
        merged_options = (self.default_options | options) if options else self.default_options
        llm_options = merged_options.llm_options or None
        prompt = self._prompt(LLMQueryRephraserPromptInput(query=query, n=merged_options.n or None))
        return await self._llm.generate(prompt, options=llm_options)

    @classmethod
    def from_config(cls, config: dict) -> Self:
        """
        Create an instance of `LLMQueryRephraser` from a configuration dictionary.

        Args:
            config: A dictionary containing configuration settings for the rephraser.

        Returns:
            An instance of the rephraser class initialized with the provided configuration.

        Raises:
            ValidationError: If the LLM or prompt configuration doesn't follow the expected format.
            InvalidConfigError: If an LLM or prompt class can't be found or is not the correct type.
        """
        config["llm"] = LLM.subclass_from_config(ObjectConstructionConfig.model_validate(config["llm"]))
        config["prompt"] = (
            import_by_path(ObjectConstructionConfig.model_validate(config["prompt"]).type)
            if "prompt" in config
            else None
        )
        return super().from_config(config)
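A usage sketch based on the constructor and rephrase signatures above. Constructing the LLM itself depends on the ragbits-core setup and is outside this diff; my_llm is a placeholder for any LLM[...] instance:

from ragbits.document_search.retrieval.rephrasers.llm import (
    LLMQueryRephraser,
    LLMQueryRephraserOptions,
)

rephraser = LLMQueryRephraser(
    llm=my_llm,  # placeholder: an LLM instance built elsewhere
    default_options=LLMQueryRephraserOptions(n=3),  # request 3 alternative queries
)
# queries = await rephraser.rephrase("whats the retrun policy?")
# With n >= 2 the prompt takes its multi-query branch, and the response parser
# splits the completion on newlines into a list of rephrased queries.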
ragbits/document_search/retrieval/rephrasers/noop.py

@@ -0,0 +1,26 @@
from collections.abc import Iterable

from ragbits.core.audit.traces import traceable
from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser, QueryRephraserOptions


class NoopQueryRephraser(QueryRephraser[QueryRephraserOptions]):
    """
    A no-op query rephraser that does not change the query.
    """

    options_cls: type[QueryRephraserOptions] = QueryRephraserOptions

    @traceable
    async def rephrase(self, query: str, options: QueryRephraserOptions | None = None) -> Iterable[str]:  # noqa: PLR6301
        """
        No-op implementation that returns the input query unchanged.

        Args:
            query: The query to rephrase.
            options: The options for the rephraser.

        Returns:
            A single-element list containing the original query.
        """
        return [query]
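For completeness, the no-op rephraser can be exercised directly (assuming the package is installed):

import asyncio

from ragbits.document_search.retrieval.rephrasers.noop import NoopQueryRephraser

# The query passes through untouched.
queries = asyncio.run(NoopQueryRephraser().rephrase("find the pricing page"))
print(list(queries))  # ['find the pricing page']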