PyPI - genflowly-encoding-library - Versions diffs - 0.1.0__tar.gz - Mend

genflowly-encoding-library 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

genflowly_encoding_library-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,16 @@
+Metadata-Version: 2.4
+Name: genflowly-encoding-library
+Version: 0.1.0
+Summary: Text encoding and chunking library.
+Author: Genflowly
+Requires-Python: >=3.9
+Requires-Dist: pymilvus
+Requires-Dist: sentence-transformers>=2.2.0
+Requires-Dist: scikit-learn
+Requires-Dist: scipy
+Requires-Dist: numpy
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: flake8; extra == "dev"
+Dynamic: author
+Dynamic: requires-python

genflowly_encoding_library-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,22 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "genflowly-encoding-library"
+version = "0.1.0"
+description = "Text encoding and chunking library."
+dependencies = [
+    "pymilvus",
+    "sentence-transformers>=2.2.0",
+    "scikit-learn",
+    "scipy",
+    "numpy"
+]
+requires-python = ">=3.9"
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "flake8"
+]

genflowly_encoding_library-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

genflowly_encoding_library-0.1.0/setup.py ADDED Viewed

@@ -0,0 +1,18 @@
+from setuptools import setup, find_packages
+setup(
+    name="genflowly-encoding-library",
+    version="0.1.0",
+    description="A library for text chunking and embedding operations.",
+    author="Genflowly",
+    packages=find_packages(where="src"),
+    package_dir={"": "src"},
+    install_requires=[
+        "pymilvus",
+        "sentence-transformers>=2.2.0",
+        "scikit-learn",
+        "scipy",
+        "numpy"
+    ],
+    python_requires=">=3.9",
+)

genflowly_encoding_library-0.1.0/src/encoding_library/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+# Public entry points of the encoding_library package.
+from .encoders.factory import EncoderFactory
+# NOTE(review): SOURCES.txt for this sdist lists no encoding_library/chunkers
+# files -- confirm the two chunkers imports below resolve in an installed copy.
+from .chunkers.factory import ChunkerFactory
+from .chunkers.enums import ChunkingType, RecursiveTechnique, SemanticTechnique
+from .encoders.enums import ModelType, ModelName
+# Names exported by ``from encoding_library import *``.
+__all__ = [
+    "EncoderFactory",
+    "ChunkerFactory",
+    "ChunkingType",
+    "RecursiveTechnique",
+    "SemanticTechnique",
+    "ModelType",
+    "ModelName"
+]

genflowly_encoding_library-0.1.0/src/encoding_library/encoders/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .encoder import Encoder
+from .sentence_transformer_encoder import SentenceTransformerEncoder
+from .factory import EncoderFactory

genflowly_encoding_library-0.1.0/src/encoding_library/encoders/encoder.py ADDED Viewed

@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any
+class Encoder(ABC):
+    """
+    Abstract base class for all encoders (embedding generators).
+    """
+    @abstractmethod
+    def encode_documents(self, documents: List[str]) -> List[Any]:
+        """
+        Encode a list of documents (strings) into embeddings.
+        """
+        pass
+    @abstractmethod
+    def get_dimension(self) -> int:
+        """
+        Return the dimension of embeddings generated by this encoder.
+        """
+        pass

genflowly_encoding_library-0.1.0/src/encoding_library/encoders/enums.py ADDED Viewed

@@ -0,0 +1,11 @@
+from enum import Enum
+class ModelType(Enum):
+    """Embedding backend identifiers; each member's value is its string form."""
+    # NOTE(review): EncoderFactory.get_encoder in this release only branches on
+    # SENTENCE_TRANSFORMER; the other members reach its final ValueError.
+    SENTENCE_TRANSFORMER = "sentence-transformer"
+    OPENAI = "openai"
+    COHERE = "cohere"
+class ModelName(Enum):
+    """Model names selectable by enum member; each value is the name string."""
+    ALL_MINILM_L6_V2 = "all-MiniLM-L6-v2"
+    # Add other model names as needed
+    # NOTE(review): presumably an OpenAI model name -- no OpenAI encoder is
+    # visible in this package; confirm before using it with the
+    # sentence-transformer backend.
+    TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"

genflowly_encoding_library-0.1.0/src/encoding_library/encoders/factory.py ADDED Viewed

@@ -0,0 +1,38 @@
+from typing import Union
+from .encoder import Encoder
+from .sentence_transformer_encoder import SentenceTransformerEncoder
+from .enums import ModelType, ModelName
+class EncoderFactory:
+    """
+    Factory class to get the appropriate encoder.
+    """
+    @staticmethod
+    def get_encoder(model_type: Union[str, ModelType] = ModelType.SENTENCE_TRANSFORMER,
+                    model_name: Union[str, ModelName] = ModelName.ALL_MINILM_L6_V2,
+                    **kwargs) -> Encoder:
+        """
+        Get encoder based on model type and name.
+        Default is SentenceTransformer with all-MiniLM-L6-v2.
+        """
+        # Normalize input to Enum
+        if isinstance(model_type, str):
+            try:
+                model_type = ModelType(model_type)
+            except ValueError:
+                raise ValueError(f"Unknown model type: {model_type}")
+        if model_type == ModelType.SENTENCE_TRANSFORMER:
+            device = kwargs.get('device', 'cpu')
+            # Handle ModelName enum or string for model_name
+            if isinstance(model_name, ModelName):
+                name_str = model_name.value
+            else:
+                name_str = model_name
+            return SentenceTransformerEncoder(model_name=name_str, device=device)
+        # Add other model types here if needed (e.g. OpenAI, Cohere)
+        raise ValueError(f"Unknown model type: {model_type}")

genflowly_encoding_library-0.1.0/src/encoding_library/encoders/sentence_transformer_encoder.py ADDED Viewed

@@ -0,0 +1,46 @@
+import logging
+from typing import List, Dict, Any
+from pymilvus.model.dense import SentenceTransformerEmbeddingFunction
+from .encoder import Encoder
+logger = logging.getLogger()
+class SentenceTransformerEncoder(Encoder):
+    """
+    Concrete implementation of Encoder using SentenceTransformer.
+    """
+    _instances = {}
+    def __new__(cls, model_name: str = 'all-MiniLM-L6-v2', device: str = 'cpu'):
+        # Singleton pattern per model_name to avoid reloading models (expensive)
+        if model_name not in cls._instances:
+            cls._instances[model_name] = super(SentenceTransformerEncoder, cls).__new__(cls)
+        return cls._instances[model_name]
+    def __init__(self, model_name: str = 'all-MiniLM-L6-v2', device: str = 'cpu'):
+        # Only initialize once
+        if not hasattr(self, 'initialized'):
+            self.model_name = model_name
+            self.device = device
+            try:
+                logger.info(f"Loading SentenceTransformer model: {model_name} on {device}")
+                self.embedding_fn = SentenceTransformerEmbeddingFunction(
+                    model_name=model_name,
+                    device=device
+                )
+                self.initialized = True
+                logger.info(f"Model loaded successfully. Dimension: {self.embedding_fn.dim}")
+            except Exception as e:
+                logger.error(f"Failed to load model {model_name}: {e}")
+                raise e
+    def get_dimension(self) -> int:
+        return self.embedding_fn.dim
+    def encode_documents(self, documents: List[str]) -> List[Any]:
+        """
+        Encode list of documents.
+        """
+        return self.embedding_fn.encode_documents(documents)

genflowly_encoding_library-0.1.0/src/encoding_library/utils/__init__.py ADDED Viewed

File without changes

genflowly_encoding_library-0.1.0/src/encoding_library/utils/text_preprocessor.py ADDED Viewed

@@ -0,0 +1,16 @@
+import re
+from typing import List
+def split_into_sentences(text: str) -> List[str]:
+    """
+    Split text into sentences to handle long documents better.
+    Simple regex split on punctuation followed by whitespace.
+    """
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    sentences = [s.strip() for s in sentences if s.strip()]
+    if not sentences:
+        # Fallback for text without punctuation or empty
+        sentences = [text]
+    return sentences

genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,16 @@
+Metadata-Version: 2.4
+Name: genflowly-encoding-library
+Version: 0.1.0
+Summary: Text encoding and chunking library.
+Author: Genflowly
+Requires-Python: >=3.9
+Requires-Dist: pymilvus
+Requires-Dist: sentence-transformers>=2.2.0
+Requires-Dist: scikit-learn
+Requires-Dist: scipy
+Requires-Dist: numpy
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: flake8; extra == "dev"
+Dynamic: author
+Dynamic: requires-python

genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,15 @@
+pyproject.toml
+setup.py
+src/encoding_library/__init__.py
+src/encoding_library/encoders/__init__.py
+src/encoding_library/encoders/encoder.py
+src/encoding_library/encoders/enums.py
+src/encoding_library/encoders/factory.py
+src/encoding_library/encoders/sentence_transformer_encoder.py
+src/encoding_library/utils/__init__.py
+src/encoding_library/utils/text_preprocessor.py
+src/genflowly_encoding_library.egg-info/PKG-INFO
+src/genflowly_encoding_library.egg-info/SOURCES.txt
+src/genflowly_encoding_library.egg-info/dependency_links.txt
+src/genflowly_encoding_library.egg-info/requires.txt
+src/genflowly_encoding_library.egg-info/top_level.txt

genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,9 @@
+pymilvus
+sentence-transformers>=2.2.0
+scikit-learn
+scipy
+numpy
+[dev]
+pytest
+flake8

genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+encoding_library