genflowly-encoding-library 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genflowly_encoding_library-0.1.0/PKG-INFO +16 -0
- genflowly_encoding_library-0.1.0/pyproject.toml +22 -0
- genflowly_encoding_library-0.1.0/setup.cfg +4 -0
- genflowly_encoding_library-0.1.0/setup.py +18 -0
- genflowly_encoding_library-0.1.0/src/encoding_library/__init__.py +14 -0
- genflowly_encoding_library-0.1.0/src/encoding_library/encoders/__init__.py +3 -0
- genflowly_encoding_library-0.1.0/src/encoding_library/encoders/encoder.py +21 -0
- genflowly_encoding_library-0.1.0/src/encoding_library/encoders/enums.py +11 -0
- genflowly_encoding_library-0.1.0/src/encoding_library/encoders/factory.py +38 -0
- genflowly_encoding_library-0.1.0/src/encoding_library/encoders/sentence_transformer_encoder.py +46 -0
- genflowly_encoding_library-0.1.0/src/encoding_library/utils/__init__.py +0 -0
- genflowly_encoding_library-0.1.0/src/encoding_library/utils/text_preprocessor.py +16 -0
- genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/PKG-INFO +16 -0
- genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/SOURCES.txt +15 -0
- genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/dependency_links.txt +1 -0
- genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/requires.txt +9 -0
- genflowly_encoding_library-0.1.0/src/genflowly_encoding_library.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: genflowly-encoding-library
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Text encoding and chunking library.
|
|
5
|
+
Author: Genflowly
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Requires-Dist: pymilvus
|
|
8
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
9
|
+
Requires-Dist: scikit-learn
|
|
10
|
+
Requires-Dist: scipy
|
|
11
|
+
Requires-Dist: numpy
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest; extra == "dev"
|
|
14
|
+
Requires-Dist: flake8; extra == "dev"
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: requires-python
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "genflowly-encoding-library"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Text encoding and chunking library."
|
|
9
|
+
dependencies = [
|
|
10
|
+
"pymilvus",
|
|
11
|
+
"sentence-transformers>=2.2.0",
|
|
12
|
+
"scikit-learn",
|
|
13
|
+
"scipy",
|
|
14
|
+
"numpy"
|
|
15
|
+
]
|
|
16
|
+
requires-python = ">=3.9"
|
|
17
|
+
|
|
18
|
+
[project.optional-dependencies]
|
|
19
|
+
dev = [
|
|
20
|
+
"pytest",
|
|
21
|
+
"flake8"
|
|
22
|
+
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
# Packaging entry point for the src/ layout. The values below duplicate the
# [project] table in pyproject.toml, so keep the two files in sync.
setup(
    name="genflowly-encoding-library",
    version="0.1.0",
    # Same wording as `description` in pyproject.toml, so the published
    # summary does not depend on which file the build backend reads.
    description="Text encoding and chunking library.",
    author="Genflowly",
    # Packages live under src/, so discover them there and map the root
    # package directory accordingly.
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    install_requires=[
        "pymilvus",
        "sentence-transformers>=2.2.0",
        "scikit-learn",
        "scipy",
        "numpy",
    ],
    python_requires=">=3.9",
)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Public API of the encoding library."""
from .encoders.factory import EncoderFactory
from .encoders.enums import ModelType, ModelName

__all__ = [
    "EncoderFactory",
    "ModelType",
    "ModelName",
]

# The chunkers subpackage is not listed among this distribution's sources.
# Importing it unconditionally made `import encoding_library` fail with
# ModuleNotFoundError, so the chunker names are exported only when the
# subpackage is actually present.
try:
    from .chunkers.factory import ChunkerFactory
    from .chunkers.enums import ChunkingType, RecursiveTechnique, SemanticTechnique
except ModuleNotFoundError as exc:
    # Tolerate only the chunkers subpackage itself being absent; a missing
    # dependency *inside* chunkers is a real error and must still surface.
    if exc.name is None or not exc.name.startswith(__name__ + ".chunkers"):
        raise
else:
    __all__ += [
        "ChunkerFactory",
        "ChunkingType",
        "RecursiveTechnique",
        "SemanticTechnique",
    ]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import List, Dict, Any
|
|
3
|
+
|
|
4
|
+
class Encoder(ABC):
    """Interface implemented by every encoder (embedding generator)."""

    @abstractmethod
    def encode_documents(self, documents: List[str]) -> List[Any]:
        """Turn a list of document strings into embeddings.

        The element type of the returned list is left to the subclass.
        """

    @abstractmethod
    def get_dimension(self) -> int:
        """Report the dimension of the embeddings this encoder generates."""
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
class ModelType(Enum):
    """Kinds of embedding model, each stored as its string identifier."""

    SENTENCE_TRANSFORMER = "sentence-transformer"
    OPENAI = "openai"
    COHERE = "cohere"
|
|
7
|
+
|
|
8
|
+
class ModelName(Enum):
    """Specific embedding models, each stored as its string identifier."""

    ALL_MINILM_L6_V2 = "all-MiniLM-L6-v2"
    TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
    # Extend with further model names as they are needed.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
from .encoder import Encoder
|
|
3
|
+
from .sentence_transformer_encoder import SentenceTransformerEncoder
|
|
4
|
+
from .enums import ModelType, ModelName
|
|
5
|
+
|
|
6
|
+
class EncoderFactory:
    """Factory that builds the encoder matching a requested model type."""

    @staticmethod
    def get_encoder(model_type: Union[str, ModelType] = ModelType.SENTENCE_TRANSFORMER,
                    model_name: Union[str, ModelName] = ModelName.ALL_MINILM_L6_V2,
                    **kwargs) -> Encoder:
        """Return an encoder for the given model type and model name.

        With no arguments this yields a SentenceTransformer encoder for
        ``all-MiniLM-L6-v2``.

        Args:
            model_type: A ``ModelType`` member, or the string value of one.
                Strings are matched case-insensitively and surrounding
                whitespace is ignored.
            model_name: A ``ModelName`` member (its value is used) or a raw
                model-name string that is passed through unchanged.
            **kwargs: Only ``device`` is read (default ``'cpu'``); any other
                keyword is ignored.

        Returns:
            The encoder instance for the requested model.

        Raises:
            ValueError: If ``model_type`` is not a recognised type, or is a
                recognised type for which no encoder is implemented here.
        """
        # Normalize string input to the enum.
        if isinstance(model_type, str):
            try:
                model_type = ModelType(model_type.strip().lower())
            except ValueError:
                # The enum's own ValueError adds nothing for the caller, so
                # drop it from the traceback. `model_type` is still the
                # caller's original string here.
                raise ValueError(f"Unknown model type: {model_type}") from None

        if not isinstance(model_type, ModelType):
            raise ValueError(f"Unknown model type: {model_type}")

        if model_type == ModelType.SENTENCE_TRANSFORMER:
            device = kwargs.get('device', 'cpu')
            # Accept either a ModelName member or a plain string.
            name_str = model_name.value if isinstance(model_name, ModelName) else model_name
            return SentenceTransformerEncoder(model_name=name_str, device=device)

        # Add other model types here if needed (e.g. OpenAI, Cohere).

        # Reaching this point means the type is a declared ModelType member
        # with no encoder wired up, so calling it "unknown" would mislead.
        raise ValueError(f"Unsupported model type: {model_type}")
|
genflowly_encoding_library-0.1.0/src/encoding_library/encoders/sentence_transformer_encoder.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List, Dict, Any
|
|
3
|
+
from pymilvus.model.dense import SentenceTransformerEmbeddingFunction
|
|
4
|
+
from .encoder import Encoder
|
|
5
|
+
|
|
6
|
+
# Use a logger named after this module rather than the root logger, so that
# applications can filter or configure this library's output separately.
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
class SentenceTransformerEncoder(Encoder):
    """Encoder that delegates to ``SentenceTransformerEmbeddingFunction``.

    Loading a model is expensive, so one instance is kept per
    ``(model_name, device)`` pair and reused by later constructor calls with
    the same arguments. The device is part of the key so that asking for a
    model on another device does not silently return the instance loaded on
    the first one.
    """

    # Cache of instances, keyed by (model_name, device).
    _instances = {}

    def __new__(cls, model_name: str = 'all-MiniLM-L6-v2', device: str = 'cpu'):
        """Return the cached instance for these arguments, creating it if needed."""
        key = (model_name, device)
        instance = cls._instances.get(key)
        if instance is None:
            instance = super().__new__(cls)
            cls._instances[key] = instance
        return instance

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2', device: str = 'cpu'):
        """Load the embedding function on first construction.

        Args:
            model_name: Model name handed to the embedding function.
            device: Device string handed to the embedding function.

        Raises:
            Exception: Whatever the embedding function raises while loading;
                it is logged and re-raised unchanged.
        """
        # __init__ also runs when __new__ returns a cached instance, so skip
        # the expensive load if this instance is already initialized.
        if getattr(self, 'initialized', False):
            return

        self.model_name = model_name
        self.device = device
        logger.info("Loading SentenceTransformer model: %s on %s", model_name, device)
        try:
            self.embedding_fn = SentenceTransformerEmbeddingFunction(
                model_name=model_name,
                device=device,
            )
        except Exception:
            logger.exception("Failed to load model %s", model_name)
            # Do not keep a half-built instance in the cache.
            type(self)._instances.pop((model_name, device), None)
            raise
        self.initialized = True
        logger.info("Model loaded successfully. Dimension: %s", self.embedding_fn.dim)

    def get_dimension(self) -> int:
        """Return the ``dim`` reported by the embedding function."""
        return self.embedding_fn.dim

    def encode_documents(self, documents: List[str]) -> List[Any]:
        """Encode ``documents`` by delegating to the embedding function."""
        return self.embedding_fn.encode_documents(documents)
|
|
45
|
+
|
|
46
|
+
|
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
def split_into_sentences(text: str) -> List[str]:
    """Break ``text`` into sentences so long documents can be handled in pieces.

    A boundary is any run of whitespace directly after ``.``, ``!`` or ``?``.
    Each piece is stripped and blank pieces are dropped. When nothing is left
    (empty or whitespace-only input), the original ``text`` is returned
    unchanged as the only element.
    """
    pieces = []
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        trimmed = fragment.strip()
        if trimmed:
            pieces.append(trimmed)

    # Fallback keeps the result non-empty.
    return pieces if pieces else [text]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: genflowly-encoding-library
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Text encoding and chunking library.
|
|
5
|
+
Author: Genflowly
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Requires-Dist: pymilvus
|
|
8
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
9
|
+
Requires-Dist: scikit-learn
|
|
10
|
+
Requires-Dist: scipy
|
|
11
|
+
Requires-Dist: numpy
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest; extra == "dev"
|
|
14
|
+
Requires-Dist: flake8; extra == "dev"
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: requires-python
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
setup.py
|
|
3
|
+
src/encoding_library/__init__.py
|
|
4
|
+
src/encoding_library/encoders/__init__.py
|
|
5
|
+
src/encoding_library/encoders/encoder.py
|
|
6
|
+
src/encoding_library/encoders/enums.py
|
|
7
|
+
src/encoding_library/encoders/factory.py
|
|
8
|
+
src/encoding_library/encoders/sentence_transformer_encoder.py
|
|
9
|
+
src/encoding_library/utils/__init__.py
|
|
10
|
+
src/encoding_library/utils/text_preprocessor.py
|
|
11
|
+
src/genflowly_encoding_library.egg-info/PKG-INFO
|
|
12
|
+
src/genflowly_encoding_library.egg-info/SOURCES.txt
|
|
13
|
+
src/genflowly_encoding_library.egg-info/dependency_links.txt
|
|
14
|
+
src/genflowly_encoding_library.egg-info/requires.txt
|
|
15
|
+
src/genflowly_encoding_library.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
encoding_library
|