PyPI - transformers-haystack - Versions diffs - 0.1.0__py3-none-any.whl - Mend

transformers-haystack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

haystack_integrations/components/classifiers/py.typed ADDED Viewed

File without changes

haystack_integrations/components/classifiers/transformers/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+from .zero_shot_document_classifier import TransformersZeroShotDocumentClassifier
+__all__ = ["TransformersZeroShotDocumentClassifier"]

haystack_integrations/components/classifiers/transformers/zero_shot_document_classifier.py ADDED Viewed

@@ -0,0 +1,247 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+from dataclasses import replace
+from typing import Any
+from haystack import Document, component, default_from_dict, default_to_dict
+from haystack.utils import ComponentDevice, Secret
+from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs
+from haystack_integrations.components.common.transformers.utils import _resolve_hf_pipeline_kwargs
+from transformers import Pipeline as HfPipeline
+from transformers import pipeline
+@component
+class TransformersZeroShotDocumentClassifier:
+    """
+    Performs zero-shot classification of documents based on given labels and adds the predicted label to their metadata.
+    The component uses a Hugging Face pipeline for zero-shot classification.
+    Provide the model and the set of labels to be used for categorization during initialization.
+    Additionally, you can configure the component to allow multiple labels to be true.
+    Classification is run on the document's content field by default. If you want it to run on another field, set the
+    `classification_field` to one of the document's metadata fields.
+    Available models for the task of zero-shot-classification include:
+        - `valhalla/distilbart-mnli-12-3`
+        - `cross-encoder/nli-distilroberta-base`
+        - `cross-encoder/nli-deberta-v3-xsmall`
+    ### Usage example
+    The following is a pipeline that classifies documents based on predefined classification labels
+    retrieved from a search pipeline:
+    ```python
+    from haystack import Document
+    from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
+    from haystack.core.pipeline import Pipeline
+    from haystack.document_stores.in_memory import InMemoryDocumentStore
+    from haystack_integrations.components.classifiers.transformers import TransformersZeroShotDocumentClassifier
+    documents = [Document(id="0", content="Today was a nice day!"),
+                 Document(id="1", content="Yesterday was a bad day!")]
+    document_store = InMemoryDocumentStore()
+    retriever = InMemoryBM25Retriever(document_store=document_store)
+    document_classifier = TransformersZeroShotDocumentClassifier(
+        model="cross-encoder/nli-deberta-v3-xsmall",
+        labels=["positive", "negative"],
+    )
+    document_store.write_documents(documents)
+    pipeline = Pipeline()
+    pipeline.add_component(instance=retriever, name="retriever")
+    pipeline.add_component(instance=document_classifier, name="document_classifier")
+    pipeline.connect("retriever", "document_classifier")
+    queries = ["How was your day today?", "How was your day yesterday?"]
+    expected_predictions = ["positive", "negative"]
+    for idx, query in enumerate(queries):
+        result = pipeline.run({"retriever": {"query": query, "top_k": 1}})
+        assert result["document_classifier"]["documents"][0].to_dict()["id"] == str(idx)
+        assert (result["document_classifier"]["documents"][0].to_dict()["classification"]["label"]
+                == expected_predictions[idx])
+    ```
+    """
+    def __init__(
+        self,
+        model: str,
+        labels: list[str],
+        multi_label: bool = False,
+        classification_field: str | None = None,
+        device: ComponentDevice | None = None,
+        token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
+        huggingface_pipeline_kwargs: dict[str, Any] | None = None,
+    ) -> None:
+        """
+        Initializes the TransformersZeroShotDocumentClassifier.
+        See the Hugging Face [website](https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=downloads&search=nli)
+        for the full list of zero-shot classification models (NLI) models.
+        :param model:
+            The name or path of a Hugging Face model for zero shot document classification.
+        :param labels:
+            The set of possible class labels to classify each document into, for example,
+            ["positive", "negative"]. The labels depend on the selected model.
+        :param multi_label:
+            Whether or not multiple candidate labels can be true.
+            If `False`, the scores are normalized such that
+            the sum of the label likelihoods for each sequence is 1. If `True`, the labels are considered
+            independent and probabilities are normalized for each candidate by doing a softmax of the entailment
+            score vs. the contradiction score.
+        :param classification_field:
+            Name of document's meta field to be used for classification.
+            If not set, `Document.content` is used by default.
+        :param device:
+            The device on which the model is loaded. If `None`, the default device is automatically
+            selected. If a device/device map is specified in `huggingface_pipeline_kwargs`, it overrides this parameter.
+        :param token:
+            The Hugging Face token to use as HTTP bearer authorization.
+            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
+        :param huggingface_pipeline_kwargs:
+            Dictionary containing keyword arguments used to initialize the
+            Hugging Face pipeline for text classification.
+        """
+        self.classification_field = classification_field
+        self.token = token
+        self.labels = labels
+        self.multi_label = multi_label
+        huggingface_pipeline_kwargs = _resolve_hf_pipeline_kwargs(
+            huggingface_pipeline_kwargs=huggingface_pipeline_kwargs or {},
+            model=model,
+            task="zero-shot-classification",
+            supported_tasks=["zero-shot-classification"],
+            device=device,
+            token=token,
+        )
+        self.huggingface_pipeline_kwargs = huggingface_pipeline_kwargs
+        self.pipeline: HfPipeline | None = None
+    def _get_telemetry_data(self) -> dict[str, Any]:
+        """
+        Data that is sent to Posthog for usage analytics.
+        """
+        if isinstance(self.huggingface_pipeline_kwargs["model"], str):
+            return {"model": self.huggingface_pipeline_kwargs["model"]}
+        return {"model": f"[object of type {type(self.huggingface_pipeline_kwargs['model'])}]"}
+    def warm_up(self) -> None:
+        """
+        Initializes the component.
+        """
+        if self.pipeline is None:
+            self.pipeline = pipeline(**self.huggingface_pipeline_kwargs)
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+        :returns:
+            Dictionary with serialized data.
+        """
+        serialization_dict = default_to_dict(
+            self,
+            labels=self.labels,
+            model=self.huggingface_pipeline_kwargs["model"],
+            huggingface_pipeline_kwargs=self.huggingface_pipeline_kwargs,
+            token=self.token,
+            multi_label=self.multi_label,
+            classification_field=self.classification_field,
+        )
+        huggingface_pipeline_kwargs = serialization_dict["init_parameters"]["huggingface_pipeline_kwargs"]
+        huggingface_pipeline_kwargs.pop("token", None)
+        serialize_hf_model_kwargs(huggingface_pipeline_kwargs)
+        return serialization_dict
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "TransformersZeroShotDocumentClassifier":
+        """
+        Deserializes the component from a dictionary.
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
+        if data["init_parameters"].get("huggingface_pipeline_kwargs") is not None:
+            deserialize_hf_model_kwargs(data["init_parameters"]["huggingface_pipeline_kwargs"])
+        return default_from_dict(cls, data)
+    @component.output_types(documents=list[Document])
+    def run(self, documents: list[Document], batch_size: int = 1) -> dict[str, Any]:
+        """
+        Classifies the documents based on the provided labels and adds them to their metadata.
+        The classification results are stored in the `classification` dict within
+        each document's metadata. If `multi_label` is set to `True`, the scores for each label are available under
+        the `details` key within the `classification` dictionary.
+        :param documents:
+            Documents to process.
+        :param batch_size:
+            Batch size used for processing the content in each document.
+        :returns:
+            A dictionary with the following key:
+            - `documents`: A list of documents with an added metadata field called `classification`.
+        """
+        if self.pipeline is None:
+            self.warm_up()
+        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
+            msg = (
+                "TransformerZeroShotDocumentClassifier expects a list of documents as input. "
+                "In case you want to classify and route a text, please use the TransformersZeroShotTextRouter."
+            )
+            raise TypeError(msg)
+        invalid_doc_ids = []
+        for doc in documents:
+            if self.classification_field is not None and self.classification_field not in doc.meta:
+                invalid_doc_ids.append(doc.id)
+        if invalid_doc_ids:
+            msg = (
+                f"The following documents do not have the classification field '{self.classification_field}': "
+                f"{', '.join(invalid_doc_ids)}"
+            )
+            raise ValueError(msg)
+        texts = [
+            (doc.content if self.classification_field is None else doc.meta[self.classification_field])
+            for doc in documents
+        ]
+        # mypy doesn't know this is set in warm_up
+        predictions = self.pipeline(  # type: ignore[misc]
+            texts, self.labels, multi_label=self.multi_label, batch_size=batch_size
+        )
+        new_documents = []
+        for prediction, document in zip(predictions, documents, strict=True):
+            formatted_prediction = {
+                "label": prediction["labels"][0],
+                "score": prediction["scores"][0],
+                "details": dict(zip(prediction["labels"], prediction["scores"], strict=True)),
+            }
+            new_meta = {**document.meta, "classification": formatted_prediction}
+            new_documents.append(replace(document, meta=new_meta))
+        return {"documents": new_documents}

haystack_integrations/components/common/py.typed ADDED Viewed

File without changes

haystack_integrations/components/common/transformers/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0

haystack_integrations/components/common/transformers/utils.py ADDED Viewed

@@ -0,0 +1,234 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+import copy
+from typing import Any
+import torch
+from haystack import logging
+from haystack.dataclasses import AsyncStreamingCallbackT, ComponentInfo, StreamingChunk, SyncStreamingCallbackT
+from haystack.utils.auth import Secret
+from haystack.utils.device import ComponentDevice
+from huggingface_hub import model_info
+from transformers import (
+    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
+    PreTrainedTokenizerFast,
+    StoppingCriteria,
+    TextStreamer,
+)
+logger = logging.getLogger(__name__)
+def _resolve_hf_device_map(device: ComponentDevice | None, model_kwargs: dict[str, Any] | None) -> dict[str, Any]:
+    """
+    Update `model_kwargs` to include the keyword argument `device_map`.
+    This method is useful you want to force loading a transformers model when using `AutoModel.from_pretrained` to
+    use `device_map`.
+    We handle the edge case where `device` and `device_map` is specified by ignoring the `device` parameter and printing
+    a warning.
+    :param device: The device on which the model is loaded. If `None`, the default device is automatically
+        selected.
+    :param model_kwargs: Additional HF keyword arguments passed to `AutoModel.from_pretrained`.
+        For details on what kwargs you can pass, see the model's documentation.
+    """
+    model_kwargs = copy.copy(model_kwargs) or {}
+    if model_kwargs.get("device_map"):
+        if device is not None:
+            logger.warning(
+                "The parameters `device` and `device_map` from `model_kwargs` are both provided. "
+                "Ignoring `device` and using `device_map`."
+            )
+        # Resolve device if device_map is provided in model_kwargs
+        device_map = model_kwargs["device_map"]
+    else:
+        device_map = ComponentDevice.resolve_device(device).to_hf()
+    # Set up device_map which allows quantized loading and multi device inference
+    # requires accelerate which is always installed when using `pip install transformers[torch]`
+    model_kwargs["device_map"] = device_map
+    return model_kwargs
+def _resolve_hf_pipeline_kwargs(
+    huggingface_pipeline_kwargs: dict[str, Any],
+    model: str,
+    task: str | None,
+    supported_tasks: list[str],
+    device: ComponentDevice | None,
+    token: Secret | None,
+) -> dict[str, Any]:
+    """
+    Resolve the HuggingFace pipeline keyword arguments based on explicit user inputs.
+    :param huggingface_pipeline_kwargs: Dictionary containing keyword arguments used to initialize a
+        Hugging Face pipeline.
+    :param model: The name or path of a Hugging Face model for on the HuggingFace Hub.
+    :param task: The task for the Hugging Face pipeline.
+    :param supported_tasks: The list of supported tasks to check the task of the model against. If the task of the model
+        is not present within this list then a ValueError is thrown.
+    :param device: The device on which the model is loaded. If `None`, the default device is automatically
+        selected. If a device/device map is specified in `huggingface_pipeline_kwargs`, it overrides this parameter.
+    :param token: The token to use as HTTP bearer authorization for remote files.
+        If the token is also specified in the `huggingface_pipeline_kwargs`, this parameter will be ignored.
+    """
+    resolved_token = token.resolve_value() if token else None
+    # check if the huggingface_pipeline_kwargs contain the essential parameters
+    # otherwise, populate them with values from other init parameters
+    huggingface_pipeline_kwargs.setdefault("model", model)
+    huggingface_pipeline_kwargs.setdefault("token", resolved_token)
+    resolved_device = ComponentDevice.resolve_device(device)
+    resolved_device.update_hf_kwargs(huggingface_pipeline_kwargs, overwrite=False)
+    # task identification and validation
+    task = task or huggingface_pipeline_kwargs.get("task")
+    if task is None and isinstance(huggingface_pipeline_kwargs["model"], str):
+        task = model_info(huggingface_pipeline_kwargs["model"], token=huggingface_pipeline_kwargs["token"]).pipeline_tag
+    if task not in supported_tasks:
+        msg = f"Task '{task}' is not supported. The supported tasks are: {', '.join(supported_tasks)}."
+        raise ValueError(msg)
+    huggingface_pipeline_kwargs["task"] = task
+    return huggingface_pipeline_kwargs
+class _StopWordsCriteria(StoppingCriteria):
+    """
+    Stops text generation in HuggingFace generators if any one of the stop words is generated.
+    Note: When a stop word is encountered, the generation of new text is stopped.
+    However, if the stop word is in the prompt itself, it can stop generating new text
+    prematurely after the first token. This is particularly important for LLMs designed
+    for dialogue generation. For these models, like for example mosaicml/mpt-7b-chat,
+    the output includes both the new text and the original prompt. Therefore, it's important
+    to make sure your prompt has no stop words.
+    """
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
+        stop_words: list[str],
+        device: str | torch.device = "cpu",
+    ) -> None:
+        """Creates an instance of _StopWordsCriteria."""
+        super().__init__()
+        # check if tokenizer is a valid tokenizer
+        if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
+            msg = (
+                f"Invalid tokenizer provided for _StopWordsCriteria - {tokenizer}. "
+                f"Please provide a valid tokenizer from the HuggingFace Transformers library."
+            )
+            raise TypeError(msg)
+        if not tokenizer.pad_token:
+            if tokenizer.eos_token:
+                tokenizer.pad_token = tokenizer.eos_token
+            else:
+                tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+        encoded_stop_words = tokenizer(stop_words, add_special_tokens=False, padding=True, return_tensors="pt")
+        self.stop_ids = encoded_stop_words.input_ids.to(device)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs: Any) -> bool:  # noqa: ARG002
+        """Check if any of the stop words are generated in the current text generation step."""
+        for stop_id in self.stop_ids:
+            found_stop_word = self.is_stop_word_found(input_ids, stop_id)
+            if found_stop_word:
+                return True
+        return False
+    @staticmethod
+    def is_stop_word_found(generated_text_ids: torch.Tensor, stop_id: torch.Tensor) -> bool:
+        """
+        Performs phrase matching.
+        Checks if a sequence of stop tokens appears in a continuous or sequential order within the generated text.
+        """
+        generated_text_ids = generated_text_ids[-1]
+        len_generated_text_ids = generated_text_ids.size(0)
+        len_stop_id = stop_id.size(0)
+        return all(generated_text_ids[len_generated_text_ids - len_stop_id :].eq(stop_id))
+class _HFTokenStreamingHandler(TextStreamer):
+    """
+    Streaming handler for TransformersChatGenerator.
+    Note: This is a helper class for TransformersChatGenerator enabling streaming
+    of generated text via Haystack SyncStreamingCallbackT callbacks.
+    Do not use this class directly.
+    """
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        stream_handler: SyncStreamingCallbackT,
+        stop_words: list[str] | None = None,
+        component_info: ComponentInfo | None = None,
+    ) -> None:
+        """Creates an instance of _HFTokenStreamingHandler."""
+        super().__init__(tokenizer=tokenizer, skip_prompt=True)
+        self.token_handler = stream_handler
+        self.stop_words = stop_words or []
+        self.component_info = component_info
+        self._call_counter = 0
+    def on_finalized_text(self, word: str, stream_end: bool = False) -> None:
+        """Callback function for handling the generated text."""
+        self._call_counter += 1
+        word_to_send = word + "\n" if stream_end else word
+        if word_to_send.strip() not in self.stop_words:
+            self.token_handler(
+                StreamingChunk(
+                    content=word_to_send, index=0, start=self._call_counter == 1, component_info=self.component_info
+                )
+            )
+class _AsyncHFTokenStreamingHandler(TextStreamer):
+    """
+    Async streaming handler for TransformersChatGenerator.
+    Note: This is a helper class for TransformersChatGenerator enabling
+    async streaming of generated text via Haystack Callable[StreamingChunk, Awaitable[None]] callbacks.
+    Do not use this class directly.
+    """
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        stream_handler: AsyncStreamingCallbackT,
+        stop_words: list[str] | None = None,
+        component_info: ComponentInfo | None = None,
+    ) -> None:
+        """Creates an instance of _AsyncHFTokenStreamingHandler."""
+        super().__init__(tokenizer=tokenizer, skip_prompt=True)
+        self.token_handler = stream_handler
+        self.stop_words = stop_words or []
+        self.component_info = component_info
+        self._queue: asyncio.Queue[StreamingChunk] = asyncio.Queue()
+    def on_finalized_text(self, word: str, stream_end: bool = False) -> None:
+        """Synchronous callback that puts chunks in a queue."""
+        word_to_send = word + "\n" if stream_end else word
+        if word_to_send.strip() not in self.stop_words:
+            self._queue.put_nowait(StreamingChunk(content=word_to_send, component_info=self.component_info))
+    async def process_queue(self) -> None:
+        """Process the queue of streaming chunks."""
+        while True:
+            try:
+                chunk = await self._queue.get()
+                await self.token_handler(chunk)
+                self._queue.task_done()
+            except asyncio.CancelledError:
+                break

haystack_integrations/components/extractors/py.typed ADDED Viewed

File without changes

haystack_integrations/components/extractors/transformers/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+from .named_entity_extractor import NamedEntityAnnotation, TransformersNamedEntityExtractor
+__all__ = ["NamedEntityAnnotation", "TransformersNamedEntityExtractor"]