PyPI - cvfile-haystack - Versions diffs - 0.1.0__tar.gz → 0.2.0__tar.gz - Mend

cvfile-haystack 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

{cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cvfile-haystack
-Version: 0.1.0
+Version: 0.2.0
 Summary: Haystack integration for the .cv open file format.
 Project-URL: Homepage, https://cvfile.org
 Project-URL: Repository, https://github.com/cvfile/cv
@@ -16,7 +16,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.10
-Requires-Dist: cvfile<1,>=0.1.0
+Requires-Dist: cvfile<2,>=0.1
 Requires-Dist: haystack-ai<3,>=2.8
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == 'dev'

{cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "cvfile-haystack"
-version = "0.1.0"
+version = "0.2.0"
 description = "Haystack integration for the .cv open file format."
 readme = "README.md"
 license = { text = "Apache-2.0" }
@@ -21,7 +21,7 @@ classifiers = [
   "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
-  "cvfile>=0.1.0,<1",
+  "cvfile>=0.1,<2",
   "haystack-ai>=2.8,<3",
 ]

{cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/src/haystack_integrations/components/converters/cvfile/converter.py RENAMED Viewed

@@ -28,13 +28,41 @@ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
         "mime_type": payload.mime_type,
         "payload": payload.name,
         "relationship": payload.relationship,
-        "language": payload.language or file.metadata.primary_language,
+        "language": payload.language,
         "primary": payload.name == file.metadata.primary_payload,
         "cv_version": file.metadata.version,
         "cv_generator": file.metadata.generator,
     }
+def _resolve_chunks(file: CvFile) -> list:
+    """Decode the file's embeddings.cbor into text-resolved chunks.
+    Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets
+    (spec §5.1) and stays the single source of truth. Returns an empty list
+    when the embed extra is not installed or the file carries no embeddings.
+    """
+    try:
+        from cvfile.embed import resolve_embedding_chunks
+    except ImportError:
+        return []
+    return resolve_embedding_chunks(file)
+def _chunk_meta(chunk: Any, file: CvFile) -> dict[str, Any]:
+    return {
+        "language": file.metadata.primary_language,
+        "cv_version": file.metadata.version,
+        "cv_generator": file.metadata.generator,
+        "chunk_id": chunk.id,
+        "chunk_offset": chunk.text_offset,
+        "chunk_length": chunk.text_length,
+        "embedding_model": chunk.model,
+        "embedding_dimension": chunk.dimension,
+        "embedding_metric": chunk.metric,
+    }
 @component
 class CVFileToDocument:
     """Convert ``.cv`` files into Haystack ``Document`` objects.
@@ -48,18 +76,32 @@ class CVFileToDocument:
     Set ``primary_only=True`` to emit only the payload marked as
     ``primaryPayload`` in the file's XMP metadata (usually the canonical
     Markdown copy), and skip all alternates.
+    Set ``mode="chunks"`` to emit one ``Document`` per pre-computed embedding
+    chunk instead of one per payload. Each chunk ``Document`` carries its vector
+    on ``Document.embedding`` and its text is sliced from the markdown using
+    UTF-8 byte offsets. Files without an embeddings payload fall back to a single
+    Markdown ``Document``. In ``mode="chunks"`` the ``primary_only`` flag is
+    ignored (chunks already index a single text payload).
     """
-    def __init__(self, primary_only: bool = False) -> None:
+    def __init__(self, primary_only: bool = False, *, mode: str = "payloads") -> None:
         """Create a CVFileToDocument component.
         :param primary_only:
             If ``True``, emit only the payload marked as ``primaryPayload``
             in the file's XMP metadata. If ``False`` (default), emit one
             ``Document`` per textual payload (the primary plus any
-            language alternates and supplements).
+            language alternates and supplements). Ignored in ``mode="chunks"``.
+        :param mode:
+            ``"payloads"`` (default) emits one ``Document`` per textual payload.
+            ``"chunks"`` emits one ``Document`` per pre-computed embedding chunk
+            with its vector attached.
         """
+        if mode not in ("payloads", "chunks"):
+            raise ValueError("mode must be 'payloads' or 'chunks'")
         self.primary_only = primary_only
+        self.mode = mode
     @component.output_types(documents=list[Document])
     def run(
@@ -105,6 +147,10 @@ class CVFileToDocument:
             stream_meta = bytestream.meta or {}
             source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)
+            if self.mode == "chunks":
+                documents.extend(self._chunk_documents(file, stream_meta, source_meta, source_label))
+                continue
             for payload in file.payloads:
                 if not _is_text_payload(payload):
                     continue
@@ -115,3 +161,28 @@ class CVFileToDocument:
                 documents.append(Document(content=payload.text(), meta=merged))
         return {"documents": documents}
+    @staticmethod
+    def _chunk_documents(
+        file: CvFile,
+        stream_meta: dict[str, Any],
+        source_meta: dict[str, Any],
+        source_label: str,
+    ) -> list[Document]:
+        chunks = _resolve_chunks(file)
+        if not chunks:
+            primary = next(
+                (p for p in file.payloads if p.name == file.metadata.primary_payload and _is_text_payload(p)),
+                None,
+            )
+            if primary is None:
+                return []
+            payload_meta = _payload_meta(primary, file)
+            merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label}
+            return [Document(content=primary.text(), meta=merged)]
+        out: list[Document] = []
+        for chunk in chunks:
+            merged = {**stream_meta, **_chunk_meta(chunk, file), **source_meta, "source": source_label}
+            out.append(Document(content=chunk.text, meta=merged, embedding=list(chunk.vector)))
+        return out

{cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/tests/test_converter.py RENAMED Viewed

@@ -11,6 +11,7 @@ from haystack.dataclasses import ByteStream
 from haystack_integrations.components.converters.cvfile import CVFileToDocument
 FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv"
+UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv"
 @pytest.fixture(scope="module")
@@ -75,3 +76,33 @@ def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
     not_a_cv.write_bytes(b"not a real cv file")
     result = converter.run(sources=[not_a_cv])
     assert result["documents"] == []
+def test_chunks_mode_attaches_a_vector_per_chunk() -> None:
+    if not FIXTURE.exists():
+        pytest.skip(f"fixture not found: {FIXTURE}")
+    docs = CVFileToDocument(mode="chunks").run(sources=[FIXTURE])["documents"]
+    assert len(docs) >= 1
+    for doc in docs:
+        assert doc.embedding is not None
+        assert len(doc.embedding) == doc.meta["embedding_dimension"]
+        assert all(isinstance(v, float) for v in doc.embedding)
+        assert doc.content.strip(), "chunk text should not be empty"
+def test_invalid_mode_rejected() -> None:
+    with pytest.raises(ValueError):
+        CVFileToDocument(mode="bogus")
+def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None:
+    if not UNICODE_FIXTURE.exists():
+        pytest.skip(f"fixture not found: {UNICODE_FIXTURE}")
+    docs = CVFileToDocument(mode="chunks").run(sources=[UNICODE_FIXTURE])["documents"]
+    joined = "".join(d.content for d in docs)
+    assert "Élodie" in joined
+    assert "工程師" in joined
+    assert "🚀" in joined
+    assert "经验" in joined
+    for doc in docs:
+        assert doc.content == doc.content.encode("utf-8").decode("utf-8")

{cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/.gitignore RENAMED Viewed

File without changes

{cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/README.md RENAMED Viewed

File without changes

{cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/src/haystack_integrations/components/converters/cvfile/__init__.py RENAMED Viewed

File without changes

{cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/tests/__init__.py RENAMED Viewed

File without changes

cvfile-haystack 0.1.0__tar.gz → 0.2.0__tar.gz

cvfile-haystack 0.1.0__tar.gz → 0.2.0__tar.gz