fabricatio 0.2.10.dev0__cp312-cp312-win_amd64.whl → 0.2.11.dev0__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricatio/actions/article.py +12 -2
- fabricatio/actions/article_rag.py +132 -11
- fabricatio/actions/fs.py +25 -0
- fabricatio/actions/output.py +17 -3
- fabricatio/actions/rag.py +42 -20
- fabricatio/actions/rules.py +14 -3
- fabricatio/capabilities/extract.py +65 -0
- fabricatio/capabilities/rag.py +5 -2
- fabricatio/capabilities/rating.py +5 -2
- fabricatio/capabilities/task.py +16 -16
- fabricatio/config.py +9 -2
- fabricatio/decorators.py +30 -30
- fabricatio/fs/__init__.py +9 -2
- fabricatio/fs/readers.py +6 -10
- fabricatio/models/adv_kwargs_types.py +5 -12
- fabricatio/models/extra/aricle_rag.py +235 -0
- fabricatio/models/extra/article_essence.py +8 -7
- fabricatio/models/extra/article_main.py +39 -1
- fabricatio/models/extra/problem.py +7 -3
- fabricatio/models/extra/rag.py +49 -23
- fabricatio/models/generic.py +58 -30
- fabricatio/models/kwargs_types.py +11 -2
- fabricatio/models/usages.py +9 -26
- fabricatio/parser.py +16 -12
- fabricatio/rust.cp312-win_amd64.pyd +0 -0
- fabricatio/rust.pyi +140 -12
- fabricatio/utils.py +23 -2
- fabricatio-0.2.11.dev0.data/scripts/tdown.exe +0 -0
- {fabricatio-0.2.10.dev0.dist-info → fabricatio-0.2.11.dev0.dist-info}/METADATA +18 -12
- {fabricatio-0.2.10.dev0.dist-info → fabricatio-0.2.11.dev0.dist-info}/RECORD +32 -29
- fabricatio-0.2.10.dev0.data/scripts/tdown.exe +0 -0
- {fabricatio-0.2.10.dev0.dist-info → fabricatio-0.2.11.dev0.dist-info}/WHEEL +0 -0
- {fabricatio-0.2.10.dev0.dist-info → fabricatio-0.2.11.dev0.dist-info}/licenses/LICENSE +0 -0
fabricatio/models/extra/rag.py
CHANGED
@@ -1,10 +1,13 @@
 """A module containing the RAG (Retrieval-Augmented Generation) models."""
 
-from abc import
-from
+from abc import ABC
+from functools import partial
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Self, Sequence, Set
 
 from fabricatio.decorators import precheck_package
-from
+from fabricatio.models.generic import Vectorizable
+from fabricatio.utils import ok
+from pydantic import JsonValue
 
 if TYPE_CHECKING:
     from importlib.util import find_spec
@@ -15,14 +18,18 @@ if TYPE_CHECKING:
     from pymilvus import CollectionSchema
 
 
-class MilvusDataBase(
+class MilvusDataBase(Vectorizable, ABC):
     """A base class for Milvus data."""
 
-    model_config = ConfigDict(use_attribute_docstrings=True)
-
     primary_field_name: ClassVar[str] = "id"
-
+    """The name of the primary field in Milvus."""
     vector_field_name: ClassVar[str] = "vector"
+    """The name of the vector field in Milvus."""
+
+    index_type: ClassVar[str] = "FLAT"
+    """The type of index to be used in Milvus."""
+    metric_type: ClassVar[str] = "COSINE"
+    """The type of metric to be used in Milvus."""
 
     def prepare_insertion(self, vector: List[float]) -> Dict[str, Any]:
         """Prepares the data for insertion into Milvus.
@@ -32,11 +39,6 @@ class MilvusDataBase(BaseModel, metaclass=ABCMeta):
         """
         return {**self.model_dump(exclude_none=True, by_alias=True), self.vector_field_name: vector}
 
-    @property
-    @abstractmethod
-    def to_vectorize(self) -> str:
-        """The text representation of the data."""
-
     @classmethod
     @precheck_package(
         "pymilvus", "pymilvus is not installed. Have you installed `fabricatio[rag]` instead of `fabricatio`?"
@@ -50,23 +52,47 @@ class MilvusDataBase(BaseModel, metaclass=ABCMeta):
             FieldSchema(cls.vector_field_name, dtype=DataType.FLOAT_VECTOR, dim=dimension),
         ]
 
-        type_mapping = {
-            str: DataType.STRING,
-            int: DataType.INT64,
-            float: DataType.DOUBLE,
-            JsonValue: DataType.JSON,
-            # TODO add more mapping
-        }
-
         for k, v in cls.model_fields.items():
             k: str
             v: FieldInfo
-
-
-
+            schema = partial(FieldSchema, k, description=v.description or "")
+            anno = ok(v.annotation)
+
+            if anno == int:
+                fields.append(schema(dtype=DataType.INT64))
+            elif anno == str:
+                fields.append(schema(dtype=DataType.VARCHAR, max_length=65535))
+            elif anno == float:
+                fields.append(schema(dtype=DataType.DOUBLE))
+            elif anno == list[str] or anno == List[str] or anno == set[str] or anno == Set[str]:
+                fields.append(
+                    schema(dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=65535, max_capacity=4096)
+                )
+            elif anno == list[int] or anno == List[int] or anno == set[int] or anno == Set[int]:
+                fields.append(schema(dtype=DataType.ARRAY, element_type=DataType.INT64, max_capacity=4096))
+            elif anno == list[float] or anno == List[float] or anno == set[float] or anno == Set[float]:
+                fields.append(schema(dtype=DataType.ARRAY, element_type=DataType.DOUBLE, max_capacity=4096))
+            elif anno == JsonValue:
+                fields.append(schema(dtype=DataType.JSON))
+
+            else:
+                raise NotImplementedError(f"{k}:{anno} is not supported")
+
         return CollectionSchema(fields)
 
     @classmethod
     def from_sequence(cls, data: Sequence[Dict[str, Any]]) -> List[Self]:
         """Constructs a list of instances from a sequence of dictionaries."""
         return [cls(**d) for d in data]
+
+
+class MilvusClassicModel(MilvusDataBase):
+    """A class representing a classic model stored in Milvus."""
+
+    text: str
+    """The text to be stored in Milvus."""
+    subject: str = ""
+    """The subject of the text."""
+
+    def _prepare_vectorization_inner(self) -> str:
+        return self.text
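For context on the schema-building hunk above, here is a minimal standalone sketch of the same annotation-to-DataType dispatch. It assumes pymilvus is installed; build_schema, its primary/vector field names, and the example field map are illustrative stand-ins, not the fabricatio classmethod (whose name is truncated in the hunk).

# Hedged sketch (assumes pymilvus is installed): rebuilds the annotation-to-DataType
# dispatch from the hunk above as a free function; names here are illustrative only.
from functools import partial
from typing import List

from pymilvus import CollectionSchema, DataType, FieldSchema


def build_schema(field_annotations: dict[str, type], dimension: int) -> CollectionSchema:
    """Map plain Python annotations onto Milvus field schemas."""
    fields = [
        FieldSchema("id", dtype=DataType.INT64, is_primary=True),
        FieldSchema("vector", dtype=DataType.FLOAT_VECTOR, dim=dimension),
    ]
    for name, anno in field_annotations.items():
        schema = partial(FieldSchema, name)
        if anno is int:
            fields.append(schema(dtype=DataType.INT64))
        elif anno is str:
            fields.append(schema(dtype=DataType.VARCHAR, max_length=65535))
        elif anno is float:
            fields.append(schema(dtype=DataType.DOUBLE))
        elif anno in (list[str], List[str]):
            fields.append(
                schema(dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=65535, max_capacity=4096)
            )
        else:
            raise NotImplementedError(f"{name}:{anno} is not supported")
    return CollectionSchema(fields)


# The new MilvusClassicModel declares only `text: str` and `subject: str`, so its
# equivalent schema is two VARCHAR columns next to the primary key and the vector field.
print(build_schema({"text": str, "subject": str}, dimension=1024))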
fabricatio/models/generic.py
CHANGED
@@ -3,17 +3,10 @@
 from abc import ABC, abstractmethod
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Self, Type, Union, final, overload
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Self, Type, Union, final, overload
 
-import
-import rtoml
-from fabricatio.config import configs
-from fabricatio.fs.readers import MAGIKA, safe_text_read
-from fabricatio.journal import logger
-from fabricatio.parser import JsonCapture
+import ujson
 from fabricatio.rust import blake3_hash, detect_language
-from fabricatio.rust_instances import TEMPLATE_MANAGER
-from fabricatio.utils import ok
 from litellm.utils import token_counter
 from pydantic import (
     BaseModel,
@@ -28,6 +21,13 @@ from pydantic import (
 )
 from pydantic.json_schema import GenerateJsonSchema, JsonSchemaValue
 
+from fabricatio.config import configs
+from fabricatio.fs.readers import safe_text_read
+from fabricatio.journal import logger
+from fabricatio.parser import JsonCapture
+from fabricatio.rust_instances import TEMPLATE_MANAGER
+from fabricatio.utils import ok
+
 
 class Base(BaseModel):
     """Base class for all models with Pydantic configuration.
@@ -53,7 +53,7 @@ class Display(Base):
         Returns:
             str: JSON string with 1-level indentation for readability
         """
-        return self.model_dump_json(indent=1,by_alias=True)
+        return self.model_dump_json(indent=1, by_alias=True)
 
     def compact(self) -> str:
         """Generate compact JSON representation.
@@ -75,9 +75,9 @@ class Display(Base):
             str: Combined display output with boundary markers
         """
         return (
-
-
-
+            "--- Start of Extra Info Sequence ---"
+            + "\n".join(d.compact() if compact else d.display() for d in seq)
+            + "--- End of Extra Info Sequence ---"
         )
 
 
@@ -118,6 +118,15 @@ class WordCount(Base):
     """Expected word count of this research component."""
 
 
+class FromMapping(Base):
+    """Class that provides a method to generate a list of objects from a mapping."""
+
+    @classmethod
+    @abstractmethod
+    def from_mapping(cls, mapping: Mapping[str, Any], **kwargs: Any) -> List[Self]:
+        """Generate a list of objects from a mapping."""
+
+
 class AsPrompt(Base):
     """Class that provides a method to generate a prompt from the model.
 
@@ -170,11 +179,17 @@ class WithRef[T](Base):
         )
 
     @overload
-    def update_ref[S: WithRef](self: S, reference: T) -> S:
+    def update_ref[S: WithRef](self: S, reference: T) -> S:
+        ...
+
     @overload
-    def update_ref[S: WithRef](self: S, reference: "WithRef[T]") -> S:
+    def update_ref[S: WithRef](self: S, reference: "WithRef[T]") -> S:
+        ...
+
     @overload
-    def update_ref[S: WithRef](self: S, reference: None = None) -> S:
+    def update_ref[S: WithRef](self: S, reference: None = None) -> S:
+        ...
+
     def update_ref[S: WithRef](self: S, reference: Union[T, "WithRef[T]", None] = None) -> S:  # noqa: PYI019
         """Update the reference of the object.
 
@@ -225,7 +240,7 @@ class PersistentAble(Base):
             - Hash generated from JSON content ensures uniqueness
         """
         p = Path(path)
-        out = self.model_dump_json(indent=1,by_alias=True)
+        out = self.model_dump_json(indent=1, by_alias=True)
 
         # Generate a timestamp in the format YYYYMMDD_HHMMSS
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -299,16 +314,18 @@ class Language(Base):
     """Class that provides a language attribute."""
 
     @property
-    def language(self)->str:
+    def language(self) -> str:
         """Get the language of the object."""
-        if isinstance(self,Described):
+        if isinstance(self, Described):
            return detect_language(self.description)
-        if isinstance(self,Titled):
+        if isinstance(self, Titled):
            return detect_language(self.title)
-        if isinstance(self,Named):
+        if isinstance(self, Named):
            return detect_language(self.name)
 
         return detect_language(self.model_dump_json(by_alias=True))
+
+
 class ModelHash(Base):
     """Class that provides a hash value for the object.
 
@@ -454,9 +471,9 @@ class WithFormatedJsonSchema(Base):
         Returns:
             str: The JSON schema of the model in a formatted string.
         """
-        return
+        return ujson.dumps(
             cls.model_json_schema(schema_generator=UnsortGenerate),
-            option=
+            option=ujson.OPT_INDENT_2,
         ).decode()
 
 
@@ -469,9 +486,11 @@ class CreateJsonObjPrompt(WithFormatedJsonSchema):
     @classmethod
     @overload
     def create_json_prompt(cls, requirement: List[str]) -> List[str]: ...
+
     @classmethod
     @overload
     def create_json_prompt(cls, requirement: str) -> str: ...
+
     @classmethod
     def create_json_prompt(cls, requirement: str | List[str]) -> str | List[str]:
         """Create the prompt for creating a JSON object with given requirement.
@@ -550,7 +569,7 @@ class FinalizedDumpAble(Base):
         Returns:
             str: The finalized dump of the object.
         """
-        return self.model_dump_json(indent=1,by_alias=True)
+        return self.model_dump_json(indent=1, by_alias=True)
 
     def finalized_dump_to(self, path: str | Path) -> Self:
         """Finalize the dump of the object to a file.
@@ -638,6 +657,8 @@ class WithDependency(Base):
         Returns:
             str: The generated prompt for the task.
         """
+        from fabricatio.fs import MAGIKA
+
         return TEMPLATE_MANAGER.render_template(
             configs.templates.dependencies_template,
             {
@@ -662,8 +683,9 @@ class Vectorizable(Base):
     This class includes methods to prepare the model for vectorization, ensuring it fits within a specified token length.
     """
 
+    @abstractmethod
     def _prepare_vectorization_inner(self) -> str:
-
+        """Prepare the model for vectorization."""
 
     @final
     def prepare_vectorization(self, max_length: Optional[int] = None) -> str:
@@ -681,8 +703,7 @@ class Vectorizable(Base):
         max_length = max_length or configs.embedding.max_sequence_length
         chunk = self._prepare_vectorization_inner()
         if max_length and (length := token_counter(text=chunk)) > max_length:
-
-            raise ValueError(err)
+            raise ValueError(f"Chunk exceeds maximum sequence length {max_length}, got {length}, see \n{chunk}")
 
         return chunk
 
@@ -733,6 +754,12 @@ class ScopedConfig(Base):
     llm_rpm: Optional[PositiveInt] = None
     """The requests per minute of the LLM model."""
 
+    llm_presence_penalty: Optional[PositiveFloat] = None
+    """The presence penalty of the LLM model."""
+
+    llm_frequency_penalty: Optional[PositiveFloat] = None
+    """The frequency penalty of the LLM model."""
+
     embedding_api_endpoint: Optional[HttpUrl] = None
     """The OpenAI API endpoint."""
 
@@ -857,13 +884,14 @@ class Patch[T](ProposedAble):
         # copy the desc info of each corresponding fields from `ref_cls`
         for field_name in [f for f in cls.model_fields if f in ref_cls.model_fields]:
             my_schema["properties"][field_name]["description"] = (
-
+                ref_cls.model_fields[field_name].description or my_schema["properties"][field_name][
+                    "description"]
             )
         my_schema["description"] = ref_cls.__doc__
 
-        return
+        return ujson.dumps(
             my_schema,
-            option=
+            option=ujson.OPT_INDENT_2,
         ).decode()
 
 
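Among other things, the hunks above add an abstract FromMapping base that only pins down a classmethod signature. Below is a minimal sketch of a conforming subclass, assuming only pydantic is installed; Row and its key/value fields are hypothetical, not fabricatio models.

# Hedged sketch of the FromMapping contract: not fabricatio's own Base, just pydantic.
from abc import abstractmethod
from typing import Any, List, Mapping, Self

from pydantic import BaseModel


class FromMapping(BaseModel):
    """Mirror of the abstract hook: build a list of objects from a mapping."""

    @classmethod
    @abstractmethod
    def from_mapping(cls, mapping: Mapping[str, Any], **kwargs: Any) -> List[Self]:
        """Generate a list of objects from a mapping."""


class Row(FromMapping):
    key: str
    value: int

    @classmethod
    def from_mapping(cls, mapping: Mapping[str, Any], **kwargs: Any) -> List[Self]:
        # One Row per mapping entry; extra kwargs are applied to every row.
        return [cls(key=k, value=v, **kwargs) for k, v in mapping.items()]


print(Row.from_mapping({"a": 1, "b": 2}))

Because pydantic's model metaclass builds on ABCMeta, the abstract hook keeps FromMapping itself non-instantiable while subclasses that implement from_mapping work normally.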
fabricatio/models/kwargs_types.py
CHANGED
@@ -1,11 +1,18 @@
 """This module contains the types for the keyword arguments of the methods in the models module."""
 
-from typing import Any, Dict, List, Optional, Required, TypedDict
+from typing import Any, Dict, List, NotRequired, Optional, Required, TypedDict
 
 from litellm.caching.caching import CacheMode
 from litellm.types.caching import CachingSupportedCallTypes
 
 
+class ChunkKwargs(TypedDict):
+    """Configuration parameters for chunking operations."""
+
+    max_chunk_size: int
+    max_overlapping_rate: NotRequired[float]
+
+
 class EmbeddingKwargs(TypedDict, total=False):
     """Configuration parameters for text embedding operations.
 
@@ -38,6 +45,8 @@ class LLMKwargs(TypedDict, total=False):
     no_store: bool # If store the response of this call to cache
     cache_ttl: int # how long the stored cache is alive, in seconds
     s_maxage: int # max accepted age of cached response, in seconds
+    presence_penalty: float
+    frequency_penalty: float
 
 
 class GenerateKwargs(LLMKwargs, total=False):
@@ -59,7 +68,7 @@ class ValidateKwargs[T](GenerateKwargs, total=False):
 
     default: Optional[T]
     max_validations: int
-
+
 
 
 class CompositeScoreKwargs(ValidateKwargs[List[Dict[str, float]]], total=False):
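ChunkKwargs is a plain TypedDict, so its required/optional split is only enforced by type checkers. The sketch below shows one way such kwargs could be consumed; chunk_text is a hypothetical helper, not part of fabricatio.

# Hedged sketch: a consumer of the new ChunkKwargs shape, assuming Python 3.12.
from typing import NotRequired, TypedDict, Unpack


class ChunkKwargs(TypedDict):
    """Configuration parameters for chunking operations."""

    max_chunk_size: int
    max_overlapping_rate: NotRequired[float]


def chunk_text(text: str, **kwargs: Unpack[ChunkKwargs]) -> list[str]:
    size = kwargs["max_chunk_size"]
    overlap = int(size * kwargs.get("max_overlapping_rate", 0.0))
    step = max(size - overlap, 1)
    return [text[i : i + size] for i in range(0, len(text), step)]


print(chunk_text("abcdefghij", max_chunk_size=4, max_overlapping_rate=0.25))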
fabricatio/models/usages.py
CHANGED
@@ -63,7 +63,7 @@ class LLMUsage(ScopedConfig):
         self._added_deployment = ROUTER.upsert_deployment(deployment)
         return ROUTER
 
-    # noinspection PyTypeChecker,PydanticTypeChecker
+    # noinspection PyTypeChecker,PydanticTypeChecker,t
     async def aquery(
         self,
         messages: List[Dict[str, str]],
@@ -122,6 +122,12 @@ class LLMUsage(ScopedConfig):
                 "cache-ttl": kwargs.get("cache_ttl"),
                 "s-maxage": kwargs.get("s_maxage"),
             },
+            presence_penalty=kwargs.get("presence_penalty")
+            or self.llm_presence_penalty
+            or configs.llm.presence_penalty,
+            frequency_penalty=kwargs.get("frequency_penalty")
+            or self.llm_frequency_penalty
+            or configs.llm.frequency_penalty,
         )
 
     async def ainvoke(
@@ -236,7 +242,6 @@ class LLMUsage(ScopedConfig):
         validator: Callable[[str], T | None],
         default: T = ...,
         max_validations: PositiveInt = 2,
-        co_extractor: Optional[GenerateKwargs] = None,
         **kwargs: Unpack[GenerateKwargs],
     ) -> T: ...
     @overload
@@ -246,7 +251,6 @@ class LLMUsage(ScopedConfig):
         validator: Callable[[str], T | None],
         default: T = ...,
         max_validations: PositiveInt = 2,
-        co_extractor: Optional[GenerateKwargs] = None,
         **kwargs: Unpack[GenerateKwargs],
     ) -> List[T]: ...
     @overload
@@ -256,7 +260,6 @@ class LLMUsage(ScopedConfig):
         validator: Callable[[str], T | None],
         default: None = None,
         max_validations: PositiveInt = 2,
-        co_extractor: Optional[GenerateKwargs] = None,
         **kwargs: Unpack[GenerateKwargs],
     ) -> Optional[T]: ...
 
@@ -267,7 +270,6 @@ class LLMUsage(ScopedConfig):
         validator: Callable[[str], T | None],
         default: None = None,
         max_validations: PositiveInt = 2,
-        co_extractor: Optional[GenerateKwargs] = None,
         **kwargs: Unpack[GenerateKwargs],
     ) -> List[Optional[T]]: ...
 
@@ -277,7 +279,6 @@ class LLMUsage(ScopedConfig):
         validator: Callable[[str], T | None],
         default: Optional[T] = None,
         max_validations: PositiveInt = 3,
-        co_extractor: Optional[GenerateKwargs] = None,
         **kwargs: Unpack[GenerateKwargs],
     ) -> Optional[T] | List[Optional[T]] | List[T] | T:
         """Asynchronously asks a question and validates the response using a given validator.
@@ -287,34 +288,16 @@ class LLMUsage(ScopedConfig):
             validator (Callable[[str], T | None]): A function to validate the response.
             default (T | None): Default value to return if validation fails. Defaults to None.
             max_validations (PositiveInt): Maximum number of validation attempts. Defaults to 3.
-            co_extractor (Optional[GenerateKwargs]): Keyword arguments for the co-extractor, if provided will enable co-extraction.
             **kwargs (Unpack[GenerateKwargs]): Additional keyword arguments for the LLM usage.
 
         Returns:
-            Optional[T] | List[
+            Optional[T] | List[T | None] | List[T] | T: The validated response.
         """
 
         async def _inner(q: str) -> Optional[T]:
             for lap in range(max_validations):
                 try:
-                    if (
-                        co_extractor is not None
-                        and logger.debug("Co-extraction is enabled.") is None
-                        and (
-                            validated := validator(
-                                response := await self.aask(
-                                    question=(
-                                        TEMPLATE_MANAGER.render_template(
-                                            configs.templates.co_validation_template,
-                                            {"original_q": q, "original_a": response},
-                                        )
-                                    ),
-                                    **co_extractor,
-                                )
-                            )
-                        )
-                        is not None
-                    ):
+                    if (validated := validator(response := await self.aask(question=q, **kwargs))) is not None:
                         logger.debug(f"Successfully validated the response at {lap}th attempt.")
                         return validated
 
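The new penalty plumbing above resolves each value from three places in order: the per-call kwargs, the instance-level ScopedConfig field, and finally the global configs.llm value. The standalone sketch below reproduces that fallback with a stand-in config object rather than fabricatio's configs.

# Hedged sketch: the three-level fallback for the new penalty knobs, no fabricatio imports.
from dataclasses import dataclass
from typing import Optional


@dataclass
class LLMConfig:
    presence_penalty: Optional[float] = None
    frequency_penalty: Optional[float] = None


GLOBAL_LLM = LLMConfig(presence_penalty=0.1, frequency_penalty=0.2)


def resolve_penalties(
    instance_presence: Optional[float] = None,
    instance_frequency: Optional[float] = None,
    **kwargs: float,
) -> tuple[Optional[float], Optional[float]]:
    # Per-call kwargs win, then the instance-scoped value, then the global config.
    presence = kwargs.get("presence_penalty") or instance_presence or GLOBAL_LLM.presence_penalty
    frequency = kwargs.get("frequency_penalty") or instance_frequency or GLOBAL_LLM.frequency_penalty
    return presence, frequency


print(resolve_penalties(instance_frequency=0.5))                        # (0.1, 0.5)
print(resolve_penalties(instance_frequency=0.5, presence_penalty=0.9))  # (0.9, 0.5)

Note that chaining with `or`, as in the hunk above, treats 0.0 the same as unset, so an explicit zero penalty falls through to the next level.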
fabricatio/parser.py
CHANGED
@@ -1,12 +1,13 @@
 """A module to parse text using regular expressions."""
 
+import re
+from functools import lru_cache
+from re import Pattern, compile
 from typing import Any, Callable, Iterable, List, Optional, Self, Tuple, Type
 
-import
-import regex
+import ujson
 from json_repair import repair_json
 from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr, ValidationError
-from regex import Pattern, compile
 
 from fabricatio.config import configs
 from fabricatio.journal import logger
@@ -25,7 +26,7 @@ class Capture(BaseModel):
     """The target groups to capture from the pattern."""
     pattern: str = Field(frozen=True)
     """The regular expression pattern to search for."""
-    flags: PositiveInt = Field(default=
+    flags: PositiveInt = Field(default=re.DOTALL | re.MULTILINE | re.IGNORECASE, frozen=True)
     """The flags to use when compiling the regular expression pattern."""
     capture_type: Optional[str] = None
     """The type of capture to perform, e.g., 'json', which is used to dispatch the fixer accordingly."""
@@ -49,7 +50,8 @@ class Capture(BaseModel):
                 logger.debug("Applying json repair to text.")
                 if isinstance(text, str):
                     return repair_json(text, ensure_ascii=False)  # pyright: ignore [reportReturnType]
-                return [repair_json(item, ensure_ascii=False) for item in
+                return [repair_json(item, ensure_ascii=False) for item in
+                        text]  # pyright: ignore [reportReturnType, reportGeneralTypeIssues]
             case _:
                 return text  # pyright: ignore [reportReturnType]
 
@@ -63,7 +65,7 @@ class Capture(BaseModel):
             str | None: The captured text if the pattern is found, otherwise None.
 
         """
-        if (match :=self._compiled.match(text) or self._compiled.search(text)
+        if (match := self._compiled.match(text) or self._compiled.search(text)) is None:
             logger.debug(f"Capture Failed {type(text)}: \n{text}")
             return None
         groups = self.fix(match.groups())
@@ -94,12 +96,12 @@ class Capture(BaseModel):
         return None
 
     def validate_with[K, T, E](
-
-
-
-
-
-
+        self,
+        text: str,
+        target_type: Type[T],
+        elements_type: Optional[Type[E]] = None,
+        length: Optional[int] = None,
+        deserializer: Callable[[Tuple[str, ...]], K] | Callable[[str], K] = ujson.loads,
     ) -> T | None:
         """Validate the given text using the pattern.
 
@@ -124,6 +126,7 @@ class Capture(BaseModel):
         return None
 
     @classmethod
+    @lru_cache(32)
     def capture_code_block(cls, language: str) -> Self:
         """Capture the first occurrence of a code block in the given text.
 
@@ -136,6 +139,7 @@ class Capture(BaseModel):
         return cls(pattern=f"```{language}(.*?)```", capture_type=language)
 
     @classmethod
+    @lru_cache(32)
     def capture_generic_block(cls, language: str) -> Self:
        """Capture the first occurrence of a generic code block in the given text.
 
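parser.py now builds its patterns with the stdlib re module and memoizes the code-block captures with lru_cache. The sketch below reproduces that pattern outside the Capture class; code_block_pattern and capture are illustrative helpers, not fabricatio APIs.

# Hedged sketch: cached code-block capture with the same stdlib flags as the new default.
import re
from functools import lru_cache
from re import Pattern, compile


@lru_cache(32)
def code_block_pattern(language: str) -> Pattern[str]:
    # DOTALL lets `.` span the newlines inside a fenced block.
    return compile(f"```{language}(.*?)```", re.DOTALL | re.MULTILINE | re.IGNORECASE)


def capture(language: str, text: str) -> str | None:
    match = code_block_pattern(language).search(text)
    return match.group(1) if match else None


print(capture("python", "intro\n```python\nprint('hi')\n```\noutro"))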