genflowly-encoding-library 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: genflowly-encoding-library
3
+ Version: 0.1.0
4
+ Summary: Text encoding and chunking library.
5
+ Author: Genflowly
6
+ Requires-Python: >=3.9
7
+ Requires-Dist: pymilvus
8
+ Requires-Dist: sentence-transformers>=2.2.0
9
+ Requires-Dist: scikit-learn
10
+ Requires-Dist: scipy
11
+ Requires-Dist: numpy
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest; extra == "dev"
14
+ Requires-Dist: flake8; extra == "dev"
15
+ Dynamic: author
16
+ Dynamic: requires-python
@@ -0,0 +1,22 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "genflowly-encoding-library"
7
+ version = "0.1.0"
8
+ description = "Text encoding and chunking library."
9
+ dependencies = [
10
+ "pymilvus",
11
+ "sentence-transformers>=2.2.0",
12
+ "scikit-learn",
13
+ "scipy",
14
+ "numpy"
15
+ ]
16
+ requires-python = ">=3.9"
17
+
18
+ [project.optional-dependencies]
19
+ dev = [
20
+ "pytest",
21
+ "flake8"
22
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,18 @@
1
from setuptools import setup, find_packages

# Legacy setup.py kept alongside pyproject.toml. The description below
# previously diverged from the [project] table / PKG-INFO ("A library for
# text chunking and embedding operations.") — aligned here so the two
# metadata sources agree.
setup(
    name="genflowly-encoding-library",
    version="0.1.0",
    description="Text encoding and chunking library.",
    author="Genflowly",
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    install_requires=[
        "pymilvus",
        "sentence-transformers>=2.2.0",
        "scikit-learn",
        "scipy",
        "numpy",
    ],
    python_requires=">=3.9",
)
@@ -0,0 +1,14 @@
1
"""Public API for the encoding library.

Re-exports the factory classes and the chunking/encoding enums so callers
can import them directly from the package root.
"""

# NOTE(review): the distribution's SOURCES.txt lists no ``chunkers``
# package (only ``encoders`` and ``utils``); if ``chunkers`` is truly
# absent from the sdist, the two imports below fail at import time —
# confirm the shipped package contents.
from .encoders.factory import EncoderFactory
from .chunkers.factory import ChunkerFactory
from .chunkers.enums import ChunkingType, RecursiveTechnique, SemanticTechnique
from .encoders.enums import ModelType, ModelName

__all__ = [
    "EncoderFactory",
    "ChunkerFactory",
    "ChunkingType",
    "RecursiveTechnique",
    "SemanticTechnique",
    "ModelType",
    "ModelName"
]
@@ -0,0 +1,3 @@
1
"""Encoders subpackage: abstract interface, concrete backends, and factory."""

from .encoder import Encoder
from .sentence_transformer_encoder import SentenceTransformerEncoder
from .factory import EncoderFactory
@@ -0,0 +1,21 @@
1
from abc import ABC, abstractmethod
from typing import Any, List


class Encoder(ABC):
    """Abstract base class for all encoders (embedding generators).

    Concrete subclasses wrap a specific embedding backend and must
    implement both abstract methods below.
    """

    @abstractmethod
    def encode_documents(self, documents: List[str]) -> List[Any]:
        """Encode a list of documents (strings) into embeddings.

        Args:
            documents: Raw text documents to embed.

        Returns:
            One embedding per input document; the concrete element type
            depends on the backend implementation.
        """

    @abstractmethod
    def get_dimension(self) -> int:
        """Return the dimension of embeddings generated by this encoder."""
@@ -0,0 +1,11 @@
1
from enum import Enum


class ModelType(Enum):
    """Supported embedding-backend families."""

    SENTENCE_TRANSFORMER = "sentence-transformer"
    OPENAI = "openai"
    COHERE = "cohere"


class ModelName(Enum):
    """Known model identifiers, valued by their canonical string names."""

    ALL_MINILM_L6_V2 = "all-MiniLM-L6-v2"
    TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
    # Extend with additional model names as new backends are supported.
@@ -0,0 +1,38 @@
1
from typing import Union
from .encoder import Encoder
from .sentence_transformer_encoder import SentenceTransformerEncoder
from .enums import ModelType, ModelName


class EncoderFactory:
    """Factory that maps a (model type, model name) pair to a concrete Encoder."""

    @staticmethod
    def get_encoder(model_type: Union[str, ModelType] = ModelType.SENTENCE_TRANSFORMER,
                    model_name: Union[str, ModelName] = ModelName.ALL_MINILM_L6_V2,
                    **kwargs) -> Encoder:
        """Return an encoder for the requested model type and name.

        Default is SentenceTransformer with all-MiniLM-L6-v2.

        Args:
            model_type: Backend family, as a ModelType or its string value.
            model_name: Model identifier, as a ModelName or raw string.
            **kwargs: Backend-specific options; currently only ``device``
                (default ``'cpu'``) is consumed.

        Raises:
            ValueError: If ``model_type`` is not a known ModelType value, or
                is known but has no implementation yet.
        """
        # Normalize string input to the ModelType enum.
        if isinstance(model_type, str):
            try:
                model_type = ModelType(model_type)
            except ValueError:
                raise ValueError(f"Unknown model type: {model_type}")

        if model_type == ModelType.SENTENCE_TRANSFORMER:
            device = kwargs.get('device', 'cpu')
            # Accept either the ModelName enum or a raw model-name string.
            name_str = model_name.value if isinstance(model_name, ModelName) else model_name
            return SentenceTransformerEncoder(model_name=name_str, device=device)

        # OPENAI / COHERE are declared in ModelType but not implemented yet.
        # Previously this fell through to "Unknown model type", which was
        # misleading for values that ARE known ModelType members.
        raise ValueError(f"Model type not supported yet: {model_type}")
@@ -0,0 +1,46 @@
1
import logging
from typing import Any, List
from pymilvus.model.dense import SentenceTransformerEmbeddingFunction
from .encoder import Encoder

# Module-scoped logger. Previously this was logging.getLogger() — the root
# logger — which interferes with the host application's logging setup.
logger = logging.getLogger(__name__)


class SentenceTransformerEncoder(Encoder):
    """Concrete Encoder backed by a SentenceTransformer embedding function.

    Instances are cached per (model_name, device) pair so each model
    configuration is loaded at most once — model loading is expensive.
    """

    _instances = {}

    def __new__(cls, model_name: str = 'all-MiniLM-L6-v2', device: str = 'cpu'):
        # Cache key includes the device: the previous key was model_name
        # alone, so requesting the same model on a different device silently
        # returned the instance bound to whichever device was asked for first.
        key = (model_name, device)
        if key not in cls._instances:
            cls._instances[key] = super().__new__(cls)
        return cls._instances[key]

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2', device: str = 'cpu'):
        # __init__ runs again on every construction of a cached instance;
        # only perform the expensive load the first time.
        if not hasattr(self, 'initialized'):
            self.model_name = model_name
            self.device = device
            try:
                logger.info("Loading SentenceTransformer model: %s on %s", model_name, device)
                self.embedding_fn = SentenceTransformerEmbeddingFunction(
                    model_name=model_name,
                    device=device
                )
                self.initialized = True
                logger.info("Model loaded successfully. Dimension: %s", self.embedding_fn.dim)
            except Exception:
                # logger.exception records the traceback; bare `raise`
                # preserves the original exception and its context.
                logger.exception("Failed to load model %s", model_name)
                raise

    def get_dimension(self) -> int:
        """Return the embedding dimension reported by the backend."""
        return self.embedding_fn.dim

    def encode_documents(self, documents: List[str]) -> List[Any]:
        """Encode a list of documents via the underlying embedding function."""
        return self.embedding_fn.encode_documents(documents)
@@ -0,0 +1,16 @@
1
import re
from typing import List


def split_into_sentences(text: str) -> List[str]:
    """Split *text* into sentences for friendlier handling of long documents.

    Splitting is a simple regex pass: break after sentence-ending
    punctuation (., !, ?) followed by whitespace. If nothing survives
    stripping — empty or whitespace-only input — the original text is
    returned as a single element so callers always receive a non-empty list.
    """
    raw_parts = re.split(r'(?<=[.!?])\s+', text)
    cleaned = [part.strip() for part in raw_parts if part.strip()]
    return cleaned if cleaned else [text]
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: genflowly-encoding-library
3
+ Version: 0.1.0
4
+ Summary: Text encoding and chunking library.
5
+ Author: Genflowly
6
+ Requires-Python: >=3.9
7
+ Requires-Dist: pymilvus
8
+ Requires-Dist: sentence-transformers>=2.2.0
9
+ Requires-Dist: scikit-learn
10
+ Requires-Dist: scipy
11
+ Requires-Dist: numpy
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest; extra == "dev"
14
+ Requires-Dist: flake8; extra == "dev"
15
+ Dynamic: author
16
+ Dynamic: requires-python
@@ -0,0 +1,15 @@
1
+ pyproject.toml
2
+ setup.py
3
+ src/encoding_library/__init__.py
4
+ src/encoding_library/encoders/__init__.py
5
+ src/encoding_library/encoders/encoder.py
6
+ src/encoding_library/encoders/enums.py
7
+ src/encoding_library/encoders/factory.py
8
+ src/encoding_library/encoders/sentence_transformer_encoder.py
9
+ src/encoding_library/utils/__init__.py
10
+ src/encoding_library/utils/text_preprocessor.py
11
+ src/genflowly_encoding_library.egg-info/PKG-INFO
12
+ src/genflowly_encoding_library.egg-info/SOURCES.txt
13
+ src/genflowly_encoding_library.egg-info/dependency_links.txt
14
+ src/genflowly_encoding_library.egg-info/requires.txt
15
+ src/genflowly_encoding_library.egg-info/top_level.txt
@@ -0,0 +1,9 @@
1
+ pymilvus
2
+ sentence-transformers>=2.2.0
3
+ scikit-learn
4
+ scipy
5
+ numpy
6
+
7
+ [dev]
8
+ pytest
9
+ flake8