embedding-flow 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- embedding_flow-0.1.2/PKG-INFO +69 -0
- embedding_flow-0.1.2/README.md +41 -0
- embedding_flow-0.1.2/embedding_flow/__init__.py +4 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2/embedding_flow}/contracts/contracts.py +2 -1
- {embedding_flow-0.1.0 → embedding_flow-0.1.2/embedding_flow}/load/load.py +2 -2
- embedding_flow-0.1.2/embedding_flow/main.py +22 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2/embedding_flow}/transform/transform.py +2 -2
- embedding_flow-0.1.2/embedding_flow.egg-info/PKG-INFO +69 -0
- embedding_flow-0.1.2/embedding_flow.egg-info/SOURCES.txt +19 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2}/embedding_flow.egg-info/requires.txt +7 -1
- embedding_flow-0.1.2/embedding_flow.egg-info/top_level.txt +1 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2}/pyproject.toml +10 -3
- {embedding_flow-0.1.0 → embedding_flow-0.1.2}/tests/test_load.py +3 -3
- {embedding_flow-0.1.0 → embedding_flow-0.1.2}/tests/test_transform.py +1 -1
- embedding_flow-0.1.0/MANIFEST.in +0 -7
- embedding_flow-0.1.0/PKG-INFO +0 -22
- embedding_flow-0.1.0/README.md +0 -40
- embedding_flow-0.1.0/embedding_flow.egg-info/PKG-INFO +0 -22
- embedding_flow-0.1.0/embedding_flow.egg-info/SOURCES.txt +0 -19
- embedding_flow-0.1.0/embedding_flow.egg-info/top_level.txt +0 -3
- embedding_flow-0.1.0/requirements.txt +0 -17
- {embedding_flow-0.1.0 → embedding_flow-0.1.2}/LICENSE +0 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2/embedding_flow}/contracts/__init__.py +0 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2/embedding_flow}/load/__init__.py +0 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2/embedding_flow}/transform/__init__.py +0 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2}/embedding_flow.egg-info/dependency_links.txt +0 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2}/setup.cfg +0 -0
- {embedding_flow-0.1.0 → embedding_flow-0.1.2}/setup.py +0 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: embedding-flow
|
3
|
+
Version: 0.1.2
|
4
|
+
Summary: Pipeline to transform text chunks into embeddings and load to Qdrant
|
5
|
+
Author: facuvega
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
7
|
+
Classifier: Programming Language :: Python :: 3.10
|
8
|
+
Classifier: Programming Language :: Python :: 3.11
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
11
|
+
Classifier: Operating System :: OS Independent
|
12
|
+
Requires-Python: >=3.10
|
13
|
+
Description-Content-Type: text/markdown
|
14
|
+
License-File: LICENSE
|
15
|
+
Requires-Dist: pandas>=2.0.0
|
16
|
+
Requires-Dist: pyarrow>=12.0.0
|
17
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
18
|
+
Requires-Dist: qdrant-client>=1.7.0
|
19
|
+
Requires-Dist: transformers
|
20
|
+
Provides-Extra: dev
|
21
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
22
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
23
|
+
Provides-Extra: cpu
|
24
|
+
Requires-Dist: torch>=2.0.0; extra == "cpu"
|
25
|
+
Provides-Extra: cuda
|
26
|
+
Requires-Dist: torch>=2.0.0; extra == "cuda"
|
27
|
+
Dynamic: license-file
|
28
|
+
|
29
|
+
# embedding-flow
|
30
|
+
|
31
|
+
Biblioteca para transformar chunks de texto en embeddings de 768 dimensiones y cargarlos en Qdrant.
|
32
|
+
|
33
|
+
## Instalación
|
34
|
+
|
35
|
+
```bash
|
36
|
+
# Instalación básica (instala torch según tu sistema)
|
37
|
+
pip install embedding-flow
|
38
|
+
|
39
|
+
# O instalar con torch CPU (recomendado si no tenés GPU)
|
40
|
+
pip install embedding-flow torch --extra-index-url https://download.pytorch.org/whl/cpu
|
41
|
+
```
|
42
|
+
|
43
|
+
## Uso
|
44
|
+
|
45
|
+
```python
|
46
|
+
from embedding_flow import embedding_flow
|
47
|
+
|
48
|
+
# Recibe el path del parquet con chunks y carga embeddings a Qdrant
|
49
|
+
embedding_flow("/path/to/chunks.parquet")
|
50
|
+
```
|
51
|
+
|
52
|
+
## Variables de entorno
|
53
|
+
|
54
|
+
```bash
|
55
|
+
QDRANT_URL=http://localhost:6333
|
56
|
+
QDRANT_COLLECTION=embeddings_collection
|
57
|
+
VECTOR_SIZE=768
|
58
|
+
```
|
59
|
+
|
60
|
+
## Flujo
|
61
|
+
|
62
|
+
1. Lee chunks desde parquet
|
63
|
+
2. Genera embeddings (768 dim) con `all-mpnet-base-v2`
|
64
|
+
3. Carga embeddings a Qdrant (Docker local)
|
65
|
+
|
66
|
+
## Licencia
|
67
|
+
|
68
|
+
MIT
|
69
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# embedding-flow
|
2
|
+
|
3
|
+
Biblioteca para transformar chunks de texto en embeddings de 768 dimensiones y cargarlos en Qdrant.
|
4
|
+
|
5
|
+
## Instalación
|
6
|
+
|
7
|
+
```bash
|
8
|
+
# Instalación básica (instala torch según tu sistema)
|
9
|
+
pip install embedding-flow
|
10
|
+
|
11
|
+
# O instalar con torch CPU (recomendado si no tenés GPU)
|
12
|
+
pip install embedding-flow torch --extra-index-url https://download.pytorch.org/whl/cpu
|
13
|
+
```
|
14
|
+
|
15
|
+
## Uso
|
16
|
+
|
17
|
+
```python
|
18
|
+
from embedding_flow import embedding_flow
|
19
|
+
|
20
|
+
# Recibe el path del parquet con chunks y carga embeddings a Qdrant
|
21
|
+
embedding_flow("/path/to/chunks.parquet")
|
22
|
+
```
|
23
|
+
|
24
|
+
## Variables de entorno
|
25
|
+
|
26
|
+
```bash
|
27
|
+
QDRANT_URL=http://localhost:6333
|
28
|
+
QDRANT_COLLECTION=embeddings_collection
|
29
|
+
VECTOR_SIZE=768
|
30
|
+
```
|
31
|
+
|
32
|
+
## Flujo
|
33
|
+
|
34
|
+
1. Lee chunks desde parquet
|
35
|
+
2. Genera embeddings (768 dim) con `all-mpnet-base-v2`
|
36
|
+
3. Carga embeddings a Qdrant (Docker local)
|
37
|
+
|
38
|
+
## Licencia
|
39
|
+
|
40
|
+
MIT
|
41
|
+
|
@@ -1,4 +1,4 @@
|
|
1
|
-
from contracts.contracts import load_data
|
1
|
+
from embedding_flow.contracts.contracts import load_data
|
2
2
|
from qdrant_client import QdrantClient
|
3
3
|
from qdrant_client.models import Distance, VectorParams, PointStruct
|
4
4
|
import pandas as pd
|
@@ -98,4 +98,4 @@ class load_embedding(load_data):
|
|
98
98
|
|
99
99
|
except Exception as e:
|
100
100
|
logger.error(f"❌ Error al cargar embeddings a Qdrant desde {parquet_path}: {e}", exc_info=True)
|
101
|
-
return False
|
101
|
+
return False
|
@@ -0,0 +1,22 @@
|
|
1
|
+
from embedding_flow.transform.transform import transform_embedding
|
2
|
+
from embedding_flow.load.load import load_embedding
|
3
|
+
import logging
|
4
|
+
|
5
|
+
logging.basicConfig(
|
6
|
+
level=logging.INFO, # Nivel mínimo de logs a mostrar
|
7
|
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
8
|
+
filename='appMain.log', # Opcional: guarda los logs en un archivo
|
9
|
+
filemode='a' # 'a' append, 'w' overwrite
|
10
|
+
)
|
11
|
+
|
12
|
+
|
13
|
+
def embedding_flow(parquet_path: str)-> str | None :
|
14
|
+
|
15
|
+
transformer = transform_embedding(parquet_path)
|
16
|
+
of = load_embedding(transformer)
|
17
|
+
if of is None:
|
18
|
+
logging.error("❌ Pipeline failed")
|
19
|
+
return None
|
20
|
+
else:
|
21
|
+
logging.info("✅ Pipeline completed successfully")
|
22
|
+
return of
|
@@ -1,4 +1,4 @@
|
|
1
|
-
from contracts.contracts import transform_data
|
1
|
+
from embedding_flow.contracts.contracts import transform_data
|
2
2
|
from pathlib import Path
|
3
3
|
import pandas as pd
|
4
4
|
from sentence_transformers import SentenceTransformer
|
@@ -39,4 +39,4 @@ class transform_embedding(transform_data):
|
|
39
39
|
|
40
40
|
except Exception as e:
|
41
41
|
logger.error(f"❌ Error al transformar en embeddings {parquet_path}: {e}", exc_info=True)
|
42
|
-
return None
|
42
|
+
return None
|
@@ -0,0 +1,69 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: embedding-flow
|
3
|
+
Version: 0.1.2
|
4
|
+
Summary: Pipeline to transform text chunks into embeddings and load to Qdrant
|
5
|
+
Author: facuvega
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
7
|
+
Classifier: Programming Language :: Python :: 3.10
|
8
|
+
Classifier: Programming Language :: Python :: 3.11
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
11
|
+
Classifier: Operating System :: OS Independent
|
12
|
+
Requires-Python: >=3.10
|
13
|
+
Description-Content-Type: text/markdown
|
14
|
+
License-File: LICENSE
|
15
|
+
Requires-Dist: pandas>=2.0.0
|
16
|
+
Requires-Dist: pyarrow>=12.0.0
|
17
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
18
|
+
Requires-Dist: qdrant-client>=1.7.0
|
19
|
+
Requires-Dist: transformers
|
20
|
+
Provides-Extra: dev
|
21
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
22
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
23
|
+
Provides-Extra: cpu
|
24
|
+
Requires-Dist: torch>=2.0.0; extra == "cpu"
|
25
|
+
Provides-Extra: cuda
|
26
|
+
Requires-Dist: torch>=2.0.0; extra == "cuda"
|
27
|
+
Dynamic: license-file
|
28
|
+
|
29
|
+
# embedding-flow
|
30
|
+
|
31
|
+
Biblioteca para transformar chunks de texto en embeddings de 768 dimensiones y cargarlos en Qdrant.
|
32
|
+
|
33
|
+
## Instalación
|
34
|
+
|
35
|
+
```bash
|
36
|
+
# Instalación básica (instala torch según tu sistema)
|
37
|
+
pip install embedding-flow
|
38
|
+
|
39
|
+
# O instalar con torch CPU (recomendado si no tenés GPU)
|
40
|
+
pip install embedding-flow torch --extra-index-url https://download.pytorch.org/whl/cpu
|
41
|
+
```
|
42
|
+
|
43
|
+
## Uso
|
44
|
+
|
45
|
+
```python
|
46
|
+
from embedding_flow import embedding_flow
|
47
|
+
|
48
|
+
# Recibe el path del parquet con chunks y carga embeddings a Qdrant
|
49
|
+
embedding_flow("/path/to/chunks.parquet")
|
50
|
+
```
|
51
|
+
|
52
|
+
## Variables de entorno
|
53
|
+
|
54
|
+
```bash
|
55
|
+
QDRANT_URL=http://localhost:6333
|
56
|
+
QDRANT_COLLECTION=embeddings_collection
|
57
|
+
VECTOR_SIZE=768
|
58
|
+
```
|
59
|
+
|
60
|
+
## Flujo
|
61
|
+
|
62
|
+
1. Lee chunks desde parquet
|
63
|
+
2. Genera embeddings (768 dim) con `all-mpnet-base-v2`
|
64
|
+
3. Carga embeddings a Qdrant (Docker local)
|
65
|
+
|
66
|
+
## Licencia
|
67
|
+
|
68
|
+
MIT
|
69
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
LICENSE
|
2
|
+
README.md
|
3
|
+
pyproject.toml
|
4
|
+
setup.py
|
5
|
+
embedding_flow/__init__.py
|
6
|
+
embedding_flow/main.py
|
7
|
+
embedding_flow.egg-info/PKG-INFO
|
8
|
+
embedding_flow.egg-info/SOURCES.txt
|
9
|
+
embedding_flow.egg-info/dependency_links.txt
|
10
|
+
embedding_flow.egg-info/requires.txt
|
11
|
+
embedding_flow.egg-info/top_level.txt
|
12
|
+
embedding_flow/contracts/__init__.py
|
13
|
+
embedding_flow/contracts/contracts.py
|
14
|
+
embedding_flow/load/__init__.py
|
15
|
+
embedding_flow/load/load.py
|
16
|
+
embedding_flow/transform/__init__.py
|
17
|
+
embedding_flow/transform/transform.py
|
18
|
+
tests/test_load.py
|
19
|
+
tests/test_transform.py
|
@@ -0,0 +1 @@
|
|
1
|
+
embedding_flow
|
@@ -4,16 +4,17 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "embedding-flow"
|
7
|
-
version = "0.1.0"
|
7
|
+
version = "0.1.2"
|
8
8
|
description = "Pipeline to transform text chunks into embeddings and load to Qdrant"
|
9
|
+
readme = "README.md"
|
9
10
|
authors = [{name = "facuvega"}]
|
10
11
|
requires-python = ">=3.10"
|
11
12
|
dependencies = [
|
12
13
|
"pandas>=2.0.0",
|
13
14
|
"pyarrow>=12.0.0",
|
14
15
|
"sentence-transformers>=2.2.0",
|
15
|
-
"torch>=2.0.0",
|
16
16
|
"qdrant-client>=1.7.0",
|
17
|
+
"transformers",
|
17
18
|
]
|
18
19
|
classifiers = [
|
19
20
|
"Programming Language :: Python :: 3",
|
@@ -29,8 +30,14 @@ dev = [
|
|
29
30
|
"pytest>=7.0.0",
|
30
31
|
"pytest-cov>=4.0.0",
|
31
32
|
]
|
33
|
+
cpu = [
|
34
|
+
"torch>=2.0.0",
|
35
|
+
]
|
36
|
+
cuda = [
|
37
|
+
"torch>=2.0.0",
|
38
|
+
]
|
32
39
|
|
33
40
|
[tool.setuptools.packages.find]
|
34
|
-
include = ["
|
41
|
+
include = ["embedding_flow*"]
|
35
42
|
exclude = ["tests*", "venv*"]
|
36
43
|
|
@@ -4,10 +4,10 @@ import tempfile
|
|
4
4
|
import numpy as np
|
5
5
|
from pathlib import Path
|
6
6
|
from unittest.mock import MagicMock, patch
|
7
|
-
from load.load import load_embedding
|
7
|
+
from embedding_flow.load.load import load_embedding
|
8
8
|
|
9
9
|
|
10
|
-
@patch('load.load.QdrantClient')
|
10
|
+
@patch('embedding_flow.load.load.QdrantClient')
|
11
11
|
def test_load_with_embeddings(mock_qdrant_client):
|
12
12
|
"""Test que load carga correctamente embeddings de 768 dims"""
|
13
13
|
# Mock del cliente
|
@@ -40,7 +40,7 @@ def test_load_with_embeddings(mock_qdrant_client):
|
|
40
40
|
Path(tmp_path).unlink(missing_ok=True)
|
41
41
|
|
42
42
|
|
43
|
-
@patch('load.load.QdrantClient')
|
43
|
+
@patch('embedding_flow.load.load.QdrantClient')
|
44
44
|
def test_load_without_embeddings(mock_qdrant_client):
|
45
45
|
"""Test que load falla sin columna 'embedding'"""
|
46
46
|
mock_client = MagicMock()
|
embedding_flow-0.1.0/MANIFEST.in
DELETED
embedding_flow-0.1.0/PKG-INFO
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: embedding-flow
|
3
|
-
Version: 0.1.0
|
4
|
-
Summary: Pipeline to transform text chunks into embeddings and load to Qdrant
|
5
|
-
Author: facuvega
|
6
|
-
Classifier: Programming Language :: Python :: 3
|
7
|
-
Classifier: Programming Language :: Python :: 3.10
|
8
|
-
Classifier: Programming Language :: Python :: 3.11
|
9
|
-
Classifier: Programming Language :: Python :: 3.12
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
11
|
-
Classifier: Operating System :: OS Independent
|
12
|
-
Requires-Python: >=3.10
|
13
|
-
License-File: LICENSE
|
14
|
-
Requires-Dist: pandas>=2.0.0
|
15
|
-
Requires-Dist: pyarrow>=12.0.0
|
16
|
-
Requires-Dist: sentence-transformers>=2.2.0
|
17
|
-
Requires-Dist: torch>=2.0.0
|
18
|
-
Requires-Dist: qdrant-client>=1.7.0
|
19
|
-
Provides-Extra: dev
|
20
|
-
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
21
|
-
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
22
|
-
Dynamic: license-file
|
embedding_flow-0.1.0/README.md
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
# embedding-flow
|
2
|
-
|
3
|
-
Pipeline for transforming text chunks into 768-dimensional embeddings and loading to Qdrant.
|
4
|
-
|
5
|
-
## Installation
|
6
|
-
|
7
|
-
```bash
|
8
|
-
pip install embedding-flow
|
9
|
-
```
|
10
|
-
|
11
|
-
## Usage
|
12
|
-
|
13
|
-
```python
|
14
|
-
from transform.transform import transform_embedding
|
15
|
-
from load.load import load_embedding
|
16
|
-
|
17
|
-
# Transform
|
18
|
-
transformer = transform_embedding()
|
19
|
-
output_path = transformer.transform_data("chunks.parquet")
|
20
|
-
|
21
|
-
# Load to Qdrant
|
22
|
-
loader = load_embedding()
|
23
|
-
loader.load_data(output_path)
|
24
|
-
```
|
25
|
-
|
26
|
-
## Environment Variables
|
27
|
-
|
28
|
-
```bash
|
29
|
-
QDRANT_URL=http://localhost:6333
|
30
|
-
QDRANT_COLLECTION=embeddings_collection
|
31
|
-
VECTOR_SIZE=768
|
32
|
-
```
|
33
|
-
|
34
|
-
## Development
|
35
|
-
|
36
|
-
```bash
|
37
|
-
pip install -e ".[dev]"
|
38
|
-
pytest tests/
|
39
|
-
```
|
40
|
-
|
@@ -1,22 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: embedding-flow
|
3
|
-
Version: 0.1.0
|
4
|
-
Summary: Pipeline to transform text chunks into embeddings and load to Qdrant
|
5
|
-
Author: facuvega
|
6
|
-
Classifier: Programming Language :: Python :: 3
|
7
|
-
Classifier: Programming Language :: Python :: 3.10
|
8
|
-
Classifier: Programming Language :: Python :: 3.11
|
9
|
-
Classifier: Programming Language :: Python :: 3.12
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
11
|
-
Classifier: Operating System :: OS Independent
|
12
|
-
Requires-Python: >=3.10
|
13
|
-
License-File: LICENSE
|
14
|
-
Requires-Dist: pandas>=2.0.0
|
15
|
-
Requires-Dist: pyarrow>=12.0.0
|
16
|
-
Requires-Dist: sentence-transformers>=2.2.0
|
17
|
-
Requires-Dist: torch>=2.0.0
|
18
|
-
Requires-Dist: qdrant-client>=1.7.0
|
19
|
-
Provides-Extra: dev
|
20
|
-
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
21
|
-
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
22
|
-
Dynamic: license-file
|
@@ -1,19 +0,0 @@
|
|
1
|
-
LICENSE
|
2
|
-
MANIFEST.in
|
3
|
-
README.md
|
4
|
-
pyproject.toml
|
5
|
-
requirements.txt
|
6
|
-
setup.py
|
7
|
-
contracts/__init__.py
|
8
|
-
contracts/contracts.py
|
9
|
-
embedding_flow.egg-info/PKG-INFO
|
10
|
-
embedding_flow.egg-info/SOURCES.txt
|
11
|
-
embedding_flow.egg-info/dependency_links.txt
|
12
|
-
embedding_flow.egg-info/requires.txt
|
13
|
-
embedding_flow.egg-info/top_level.txt
|
14
|
-
load/__init__.py
|
15
|
-
load/load.py
|
16
|
-
tests/test_load.py
|
17
|
-
tests/test_transform.py
|
18
|
-
transform/__init__.py
|
19
|
-
transform/transform.py
|
@@ -1,17 +0,0 @@
|
|
1
|
-
# Core dependencies
|
2
|
-
pandas>=2.0.0
|
3
|
-
pyarrow>=12.0.0
|
4
|
-
|
5
|
-
# ML & Embeddings
|
6
|
-
sentence-transformers>=2.2.0
|
7
|
-
torch>=2.0.0
|
8
|
-
|
9
|
-
# Vector Database
|
10
|
-
qdrant-client>=1.7.0
|
11
|
-
|
12
|
-
# Airflow (si se necesita localmente, sino está en el servidor)
|
13
|
-
# apache-airflow>=2.7.0
|
14
|
-
|
15
|
-
# Utilities
|
16
|
-
python-dotenv>=1.0.0
|
17
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|