embedding-flow 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {embedding_flow-0.1.0/embedding_flow.egg-info → embedding_flow-0.1.1}/PKG-INFO +40 -1
  2. embedding_flow-0.1.1/README.md +37 -0
  3. embedding_flow-0.1.1/embedding_flow/__init__.py +4 -0
  4. {embedding_flow-0.1.0 → embedding_flow-0.1.1/embedding_flow}/contracts/contracts.py +2 -1
  5. {embedding_flow-0.1.0 → embedding_flow-0.1.1/embedding_flow}/load/load.py +2 -2
  6. embedding_flow-0.1.1/embedding_flow/main.py +22 -0
  7. {embedding_flow-0.1.0 → embedding_flow-0.1.1/embedding_flow}/transform/transform.py +2 -2
  8. {embedding_flow-0.1.0 → embedding_flow-0.1.1/embedding_flow.egg-info}/PKG-INFO +40 -1
  9. embedding_flow-0.1.1/embedding_flow.egg-info/SOURCES.txt +19 -0
  10. embedding_flow-0.1.1/embedding_flow.egg-info/top_level.txt +1 -0
  11. {embedding_flow-0.1.0 → embedding_flow-0.1.1}/pyproject.toml +3 -2
  12. {embedding_flow-0.1.0 → embedding_flow-0.1.1}/tests/test_load.py +3 -3
  13. {embedding_flow-0.1.0 → embedding_flow-0.1.1}/tests/test_transform.py +1 -1
  14. embedding_flow-0.1.0/MANIFEST.in +0 -7
  15. embedding_flow-0.1.0/README.md +0 -40
  16. embedding_flow-0.1.0/embedding_flow.egg-info/SOURCES.txt +0 -19
  17. embedding_flow-0.1.0/embedding_flow.egg-info/top_level.txt +0 -3
  18. embedding_flow-0.1.0/requirements.txt +0 -17
  19. {embedding_flow-0.1.0 → embedding_flow-0.1.1}/LICENSE +0 -0
  20. {embedding_flow-0.1.0 → embedding_flow-0.1.1/embedding_flow}/contracts/__init__.py +0 -0
  21. {embedding_flow-0.1.0 → embedding_flow-0.1.1/embedding_flow}/load/__init__.py +0 -0
  22. {embedding_flow-0.1.0 → embedding_flow-0.1.1/embedding_flow}/transform/__init__.py +0 -0
  23. {embedding_flow-0.1.0 → embedding_flow-0.1.1}/embedding_flow.egg-info/dependency_links.txt +0 -0
  24. {embedding_flow-0.1.0 → embedding_flow-0.1.1}/embedding_flow.egg-info/requires.txt +0 -0
  25. {embedding_flow-0.1.0 → embedding_flow-0.1.1}/setup.cfg +0 -0
  26. {embedding_flow-0.1.0 → embedding_flow-0.1.1}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: embedding-flow
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Pipeline to transform text chunks into embeddings and load to Qdrant
5
5
  Author: facuvega
6
6
  Classifier: Programming Language :: Python :: 3
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.12
10
10
  Classifier: License :: OSI Approved :: MIT License
11
11
  Classifier: Operating System :: OS Independent
12
12
  Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
13
14
  License-File: LICENSE
14
15
  Requires-Dist: pandas>=2.0.0
15
16
  Requires-Dist: pyarrow>=12.0.0
@@ -20,3 +21,41 @@ Provides-Extra: dev
20
21
  Requires-Dist: pytest>=7.0.0; extra == "dev"
21
22
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
22
23
  Dynamic: license-file
24
+
25
+ # embedding-flow
26
+
27
+ Biblioteca para transformar chunks de texto en embeddings de 768 dimensiones y cargarlos en Qdrant.
28
+
29
+ ## Instalación
30
+
31
+ ```bash
32
+ pip install embedding-flow
33
+ ```
34
+
35
+ ## Uso
36
+
37
+ ```python
38
+ from embedding_flow import embedding_flow
39
+
40
+ # Recibe el path del parquet con chunks y carga embeddings a Qdrant
41
+ embedding_flow("/path/to/chunks.parquet")
42
+ ```
43
+
44
+ ## Variables de entorno
45
+
46
+ ```bash
47
+ QDRANT_URL=http://localhost:6333
48
+ QDRANT_COLLECTION=embeddings_collection
49
+ VECTOR_SIZE=768
50
+ ```
51
+
52
+ ## Flujo
53
+
54
+ 1. Lee chunks desde parquet
55
+ 2. Genera embeddings (768 dim) con `all-mpnet-base-v2`
56
+ 3. Carga embeddings a Qdrant (Docker local)
57
+
58
+ ## Licencia
59
+
60
+ MIT
61
+
@@ -0,0 +1,37 @@
1
+ # embedding-flow
2
+
3
+ Biblioteca para transformar chunks de texto en embeddings de 768 dimensiones y cargarlos en Qdrant.
4
+
5
+ ## Instalación
6
+
7
+ ```bash
8
+ pip install embedding-flow
9
+ ```
10
+
11
+ ## Uso
12
+
13
+ ```python
14
+ from embedding_flow import embedding_flow
15
+
16
+ # Recibe el path del parquet con chunks y carga embeddings a Qdrant
17
+ embedding_flow("/path/to/chunks.parquet")
18
+ ```
19
+
20
+ ## Variables de entorno
21
+
22
+ ```bash
23
+ QDRANT_URL=http://localhost:6333
24
+ QDRANT_COLLECTION=embeddings_collection
25
+ VECTOR_SIZE=768
26
+ ```
27
+
28
+ ## Flujo
29
+
30
+ 1. Lee chunks desde parquet
31
+ 2. Genera embeddings (768 dim) con `all-mpnet-base-v2`
32
+ 3. Carga embeddings a Qdrant (Docker local)
33
+
34
+ ## Licencia
35
+
36
+ MIT
37
+
@@ -0,0 +1,4 @@
1
+ from embedding_flow.main import embedding_flow
2
+
3
+ __all__ = ['embedding_flow']
4
+
@@ -11,4 +11,5 @@ class load_data(ABC):
11
11
  @abstractmethod
12
12
  def load_data(self, url: str) -> bool:
13
13
  """Carga datos y retorna True si fue exitoso, False si falló"""
14
- pass
14
+ pass
15
+
@@ -1,4 +1,4 @@
1
- from contracts.contracts import load_data
1
+ from embedding_flow.contracts.contracts import load_data
2
2
  from qdrant_client import QdrantClient
3
3
  from qdrant_client.models import Distance, VectorParams, PointStruct
4
4
  import pandas as pd
@@ -98,4 +98,4 @@ class load_embedding(load_data):
98
98
 
99
99
  except Exception as e:
100
100
  logger.error(f"❌ Error al cargar embeddings a Qdrant desde {parquet_path}: {e}", exc_info=True)
101
- return False
101
+ return False
@@ -0,0 +1,22 @@
1
+ from embedding_flow.transform.transform import transform_embedding
2
+ from embedding_flow.load.load import load_embedding
3
+ import logging
4
+
5
+ logging.basicConfig(
6
+ level=logging.INFO, # Nivel mínimo de logs a mostrar
7
+ format='%(asctime)s - %(levelname)s - %(message)s',
8
+ filename='appMain.log', # Opcional: guarda los logs en un archivo
9
+ filemode='a' # 'a' append, 'w' overwrite
10
+ )
11
+
12
+
13
def embedding_flow(parquet_path: str) -> str | None:
    """Embed the chunks stored in a parquet file and load them into Qdrant.

    Runs the two pipeline steps in order: the transform step generates the
    embeddings, then the load step uploads them.

    Args:
        parquet_path: Path to the parquet file containing the text chunks.

    Returns:
        The value returned by the transform step (presumably the path of the
        parquet file with embeddings -- confirm against
        ``transform_embedding.transform_data``) when both steps succeed, or
        ``None`` when either step fails.
    """
    # The pipeline classes are instantiated without arguments; the actual work
    # is done by their ``transform_data`` / ``load_data`` methods. Merely
    # constructing them never runs the pipeline.
    output_path = transform_embedding().transform_data(parquet_path)
    if output_path is None:
        # The transform step signals failure by returning None.
        logging.error("❌ Pipeline failed")
        return None

    # The load step signals failure by returning False rather than raising.
    if not load_embedding().load_data(output_path):
        logging.error("❌ Pipeline failed")
        return None

    logging.info("✅ Pipeline completed successfully")
    return output_path
@@ -1,4 +1,4 @@
1
- from contracts.contracts import transform_data
1
+ from embedding_flow.contracts.contracts import transform_data
2
2
  from pathlib import Path
3
3
  import pandas as pd
4
4
  from sentence_transformers import SentenceTransformer
@@ -39,4 +39,4 @@ class transform_embedding(transform_data):
39
39
 
40
40
  except Exception as e:
41
41
  logger.error(f"❌ Error al transformar en embeddings {parquet_path}: {e}", exc_info=True)
42
- return None
42
+ return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: embedding-flow
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Pipeline to transform text chunks into embeddings and load to Qdrant
5
5
  Author: facuvega
6
6
  Classifier: Programming Language :: Python :: 3
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.12
10
10
  Classifier: License :: OSI Approved :: MIT License
11
11
  Classifier: Operating System :: OS Independent
12
12
  Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
13
14
  License-File: LICENSE
14
15
  Requires-Dist: pandas>=2.0.0
15
16
  Requires-Dist: pyarrow>=12.0.0
@@ -20,3 +21,41 @@ Provides-Extra: dev
20
21
  Requires-Dist: pytest>=7.0.0; extra == "dev"
21
22
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
22
23
  Dynamic: license-file
24
+
25
+ # embedding-flow
26
+
27
+ Biblioteca para transformar chunks de texto en embeddings de 768 dimensiones y cargarlos en Qdrant.
28
+
29
+ ## Instalación
30
+
31
+ ```bash
32
+ pip install embedding-flow
33
+ ```
34
+
35
+ ## Uso
36
+
37
+ ```python
38
+ from embedding_flow import embedding_flow
39
+
40
+ # Recibe el path del parquet con chunks y carga embeddings a Qdrant
41
+ embedding_flow("/path/to/chunks.parquet")
42
+ ```
43
+
44
+ ## Variables de entorno
45
+
46
+ ```bash
47
+ QDRANT_URL=http://localhost:6333
48
+ QDRANT_COLLECTION=embeddings_collection
49
+ VECTOR_SIZE=768
50
+ ```
51
+
52
+ ## Flujo
53
+
54
+ 1. Lee chunks desde parquet
55
+ 2. Genera embeddings (768 dim) con `all-mpnet-base-v2`
56
+ 3. Carga embeddings a Qdrant (Docker local)
57
+
58
+ ## Licencia
59
+
60
+ MIT
61
+
@@ -0,0 +1,19 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ embedding_flow/__init__.py
6
+ embedding_flow/main.py
7
+ embedding_flow.egg-info/PKG-INFO
8
+ embedding_flow.egg-info/SOURCES.txt
9
+ embedding_flow.egg-info/dependency_links.txt
10
+ embedding_flow.egg-info/requires.txt
11
+ embedding_flow.egg-info/top_level.txt
12
+ embedding_flow/contracts/__init__.py
13
+ embedding_flow/contracts/contracts.py
14
+ embedding_flow/load/__init__.py
15
+ embedding_flow/load/load.py
16
+ embedding_flow/transform/__init__.py
17
+ embedding_flow/transform/transform.py
18
+ tests/test_load.py
19
+ tests/test_transform.py
@@ -0,0 +1 @@
1
+ embedding_flow
@@ -4,8 +4,9 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "embedding-flow"
7
- version = "0.1.0"
7
+ version = "0.1.1"
8
8
  description = "Pipeline to transform text chunks into embeddings and load to Qdrant"
9
+ readme = "README.md"
9
10
  authors = [{name = "facuvega"}]
10
11
  requires-python = ">=3.10"
11
12
  dependencies = [
@@ -31,6 +32,6 @@ dev = [
31
32
  ]
32
33
 
33
34
  [tool.setuptools.packages.find]
34
- include = ["contracts*", "transform*", "load*"]
35
+ include = ["embedding_flow*"]
35
36
  exclude = ["tests*", "venv*"]
36
37
 
@@ -4,10 +4,10 @@ import tempfile
4
4
  import numpy as np
5
5
  from pathlib import Path
6
6
  from unittest.mock import MagicMock, patch
7
- from load.load import load_embedding
7
+ from embedding_flow.load.load import load_embedding
8
8
 
9
9
 
10
- @patch('load.load.QdrantClient')
10
+ @patch('embedding_flow.load.load.QdrantClient')
11
11
  def test_load_with_embeddings(mock_qdrant_client):
12
12
  """Test que load carga correctamente embeddings de 768 dims"""
13
13
  # Mock del cliente
@@ -40,7 +40,7 @@ def test_load_with_embeddings(mock_qdrant_client):
40
40
  Path(tmp_path).unlink(missing_ok=True)
41
41
 
42
42
 
43
- @patch('load.load.QdrantClient')
43
+ @patch('embedding_flow.load.load.QdrantClient')
44
44
  def test_load_without_embeddings(mock_qdrant_client):
45
45
  """Test que load falla sin columna 'embedding'"""
46
46
  mock_client = MagicMock()
@@ -2,7 +2,7 @@ import pytest
2
2
  import pandas as pd
3
3
  import tempfile
4
4
  from pathlib import Path
5
- from transform.transform import transform_embedding
5
+ from embedding_flow.transform.transform import transform_embedding
6
6
 
7
7
 
8
8
  def test_transform_creates_embeddings():
@@ -1,7 +0,0 @@
1
- include README.md
2
- include LICENSE
3
- include requirements.txt
4
- recursive-include contracts *.py
5
- recursive-include transform *.py
6
- recursive-include load *.py
7
-
@@ -1,40 +0,0 @@
1
- # embedding-flow
2
-
3
- Pipeline for transforming text chunks into 768-dimensional embeddings and loading to Qdrant.
4
-
5
- ## Installation
6
-
7
- ```bash
8
- pip install embedding-flow
9
- ```
10
-
11
- ## Usage
12
-
13
- ```python
14
- from transform.transform import transform_embedding
15
- from load.load import load_embedding
16
-
17
- # Transform
18
- transformer = transform_embedding()
19
- output_path = transformer.transform_data("chunks.parquet")
20
-
21
- # Load to Qdrant
22
- loader = load_embedding()
23
- loader.load_data(output_path)
24
- ```
25
-
26
- ## Environment Variables
27
-
28
- ```bash
29
- QDRANT_URL=http://localhost:6333
30
- QDRANT_COLLECTION=embeddings_collection
31
- VECTOR_SIZE=768
32
- ```
33
-
34
- ## Development
35
-
36
- ```bash
37
- pip install -e ".[dev]"
38
- pytest tests/
39
- ```
40
-
@@ -1,19 +0,0 @@
1
- LICENSE
2
- MANIFEST.in
3
- README.md
4
- pyproject.toml
5
- requirements.txt
6
- setup.py
7
- contracts/__init__.py
8
- contracts/contracts.py
9
- embedding_flow.egg-info/PKG-INFO
10
- embedding_flow.egg-info/SOURCES.txt
11
- embedding_flow.egg-info/dependency_links.txt
12
- embedding_flow.egg-info/requires.txt
13
- embedding_flow.egg-info/top_level.txt
14
- load/__init__.py
15
- load/load.py
16
- tests/test_load.py
17
- tests/test_transform.py
18
- transform/__init__.py
19
- transform/transform.py
@@ -1,3 +0,0 @@
1
- contracts
2
- load
3
- transform
@@ -1,17 +0,0 @@
1
- # Core dependencies
2
- pandas>=2.0.0
3
- pyarrow>=12.0.0
4
-
5
- # ML & Embeddings
6
- sentence-transformers>=2.2.0
7
- torch>=2.0.0
8
-
9
- # Vector Database
10
- qdrant-client>=1.7.0
11
-
12
- # Airflow (si se necesita localmente, sino está en el servidor)
13
- # apache-airflow>=2.7.0
14
-
15
- # Utilities
16
- python-dotenv>=1.0.0
17
-
File without changes
File without changes
File without changes