PyPI - cvfile-haystack - Versions diffs - 0.1.0__tar.gz - Mend

cvfile-haystack 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

cvfile_haystack-0.1.0/.gitignore +67 -0
cvfile_haystack-0.1.0/PKG-INFO +101 -0
cvfile_haystack-0.1.0/README.md +76 -0
cvfile_haystack-0.1.0/pyproject.toml +47 -0
cvfile_haystack-0.1.0/src/haystack_integrations/components/converters/cvfile/__init__.py +5 -0
cvfile_haystack-0.1.0/src/haystack_integrations/components/converters/cvfile/converter.py +117 -0
cvfile_haystack-0.1.0/tests/__init__.py +0 -0
cvfile_haystack-0.1.0/tests/test_converter.py +77 -0

cvfile_haystack-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,67 @@
+# Build output
+dist/
+build/
+out/
+.next/
+.astro/
+.turbo/
+.cache/
+.parcel-cache/
+*.tsbuildinfo
+# Dependencies
+node_modules/
+.pnpm-store/
+__pycache__/
+*.pyc
+.venv/
+venv/
+.tox/
+*.egg-info/
+# Go
+vendor/
+# Logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+# Editor / OS
+.DS_Store
+Thumbs.db
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# Test artefacts
+coverage/
+.nyc_output/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+# Secrets — never commit
+.env
+.env.*
+!.env.example
+*.pem
+*.key
+hf_*
+# Don't ignore generated fixtures — committing them keeps tests green on a
+# fresh clone (no need to rebuild jane-doe.cv before running cross-SDK
+# interop / security / search tests).
+# Tauri
+src-tauri/target/
+# Misc
+*.pid
+*.seed
+*.tgz
+.vercel

cvfile_haystack-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,101 @@
+Metadata-Version: 2.4
+Name: cvfile-haystack
+Version: 0.1.0
+Summary: Haystack integration for the .cv open file format.
+Project-URL: Homepage, https://cvfile.org
+Project-URL: Repository, https://github.com/cvfile/cv
+Project-URL: Issues, https://github.com/cvfile/cv/issues
+Author: cvfile.org
+License: Apache-2.0
+Keywords: ats,converter,cv,haystack,pdf,pdfa,rag,resume
+Classifier: Development Status :: 3 - Alpha
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Python: >=3.10
+Requires-Dist: cvfile<1,>=0.1.0
+Requires-Dist: haystack-ai<3,>=2.8
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: ruff>=0.7; extra == 'dev'
+Description-Content-Type: text/markdown
+# cvfile-haystack
+Haystack 2.x converter component for the [`.cv`](https://cvfile.org) open file format.
+A `.cv` file is a PDF/A-3u file carrying a Markdown copy of the same content
+(plus optional HTML and JSON Resume) as PDF Associated Files. Instead of
+OCR-ing the PDF, this component reads the embedded text payloads directly and
+emits Haystack `Document` objects ready for indexing.
+## Install
+```bash
+pip install cvfile-haystack
+```
+## Use
+```python
+from haystack_integrations.components.converters.cvfile import CVFileToDocument
+converter = CVFileToDocument()
+result = converter.run(sources=["resume.cv"])
+documents = result["documents"]
+for doc in documents:
+    print(doc.meta["payload"], doc.meta["mime_type"], len(doc.content))
+```
+You get one `Document` per textual payload found in the file. The Markdown
+copy (typically `resume.md`) is the one flagged with `meta["primary"] = True`.
+### Primary only
+If you only want the canonical Markdown copy and want to skip language
+alternates and supplements:
+```python
+converter = CVFileToDocument(primary_only=True)
+```
+### Pipeline use
+```python
+from haystack import Pipeline
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder
+from haystack.components.writers import DocumentWriter
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack_integrations.components.converters.cvfile import CVFileToDocument
+store = InMemoryDocumentStore()
+pipe = Pipeline()
+pipe.add_component("read", CVFileToDocument(primary_only=True))
+pipe.add_component("embed", SentenceTransformersDocumentEmbedder(model="BAAI/bge-m3"))
+pipe.add_component("write", DocumentWriter(document_store=store))
+pipe.connect("read.documents", "embed.documents")
+pipe.connect("embed.documents", "write.documents")
+pipe.run({"read": {"sources": ["resumes/jane.cv", "resumes/john.cv"]}})
+```
+## Metadata fields
+| Key | Description |
+|---|---|
+| `source` | The file path (or stream name) the document came from |
+| `payload` | Name of the embedded file (e.g. `resume.md`) |
+| `mime_type` | MIME of the payload (`text/markdown`, `text/html`, `application/json`) |
+| `relationship` | PDF Associated Files relationship (`Alternative` for primary alternates) |
+| `language` | BCP 47 language tag for this payload |
+| `primary` | `True` for the payload declared as primary in the file's XMP metadata |
+| `cv_version` | Version of the `.cv` spec the file conforms to |
+| `cv_generator` | Tool that produced the file, if recorded |
+## License
+Apache-2.0.

cvfile_haystack-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,76 @@
+# cvfile-haystack
+Haystack 2.x converter component for the [`.cv`](https://cvfile.org) open file format.
+A `.cv` file is a PDF/A-3u file carrying a Markdown copy of the same content
+(plus optional HTML and JSON Resume) as PDF Associated Files. Instead of
+OCR-ing the PDF, this component reads the embedded text payloads directly and
+emits Haystack `Document` objects ready for indexing.
+## Install
+```bash
+pip install cvfile-haystack
+```
+## Use
+```python
+from haystack_integrations.components.converters.cvfile import CVFileToDocument
+converter = CVFileToDocument()
+result = converter.run(sources=["resume.cv"])
+documents = result["documents"]
+for doc in documents:
+    print(doc.meta["payload"], doc.meta["mime_type"], len(doc.content))
+```
+You get one `Document` per textual payload found in the file. The Markdown
+copy (typically `resume.md`) is the one flagged with `meta["primary"] = True`.
+### Primary only
+If you only want the canonical Markdown copy and want to skip language
+alternates and supplements:
+```python
+converter = CVFileToDocument(primary_only=True)
+```
+### Pipeline use
+```python
+from haystack import Pipeline
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder
+from haystack.components.writers import DocumentWriter
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack_integrations.components.converters.cvfile import CVFileToDocument
+store = InMemoryDocumentStore()
+pipe = Pipeline()
+pipe.add_component("read", CVFileToDocument(primary_only=True))
+pipe.add_component("embed", SentenceTransformersDocumentEmbedder(model="BAAI/bge-m3"))
+pipe.add_component("write", DocumentWriter(document_store=store))
+pipe.connect("read.documents", "embed.documents")
+pipe.connect("embed.documents", "write.documents")
+pipe.run({"read": {"sources": ["resumes/jane.cv", "resumes/john.cv"]}})
+```
+## Metadata fields
+| Key | Description |
+|---|---|
+| `source` | The file path (or stream name) the document came from |
+| `payload` | Name of the embedded file (e.g. `resume.md`) |
+| `mime_type` | MIME of the payload (`text/markdown`, `text/html`, `application/json`) |
+| `relationship` | PDF Associated Files relationship (`Alternative` for primary alternates) |
+| `language` | BCP 47 language tag for this payload |
+| `primary` | `True` for the payload declared as primary in the file's XMP metadata |
+| `cv_version` | Version of the `.cv` spec the file conforms to |
+| `cv_generator` | Tool that produced the file, if recorded |
+## License
+Apache-2.0.

cvfile_haystack-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,47 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "cvfile-haystack"
+version = "0.1.0"
+description = "Haystack integration for the .cv open file format."
+readme = "README.md"
+license = { text = "Apache-2.0" }
+requires-python = ">=3.10"
+authors = [{ name = "cvfile.org" }]
+keywords = ["haystack", "cv", "resume", "pdf", "pdfa", "rag", "ats", "converter"]
+classifiers = [
+  "Development Status :: 3 - Alpha",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+]
+dependencies = [
+  "cvfile>=0.1.0,<1",
+  "haystack-ai>=2.8,<3",
+]
+[project.urls]
+Homepage = "https://cvfile.org"
+Repository = "https://github.com/cvfile/cv"
+Issues = "https://github.com/cvfile/cv/issues"
+[project.optional-dependencies]
+dev = [
+  "pytest>=8.0",
+  "ruff>=0.7",
+]
+[tool.hatch.build.targets.wheel]
+packages = ["src/haystack_integrations"]
+[tool.ruff]
+line-length = 120
+target-version = "py310"
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP", "N", "SIM", "RUF"]

cvfile_haystack-0.1.0/src/haystack_integrations/components/converters/cvfile/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Haystack converter for the .cv open file format."""
+from haystack_integrations.components.converters.cvfile.converter import CVFileToDocument
+__all__ = ["CVFileToDocument"]

cvfile_haystack-0.1.0/src/haystack_integrations/components/converters/cvfile/converter.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""Haystack ``@component`` converter for the .cv open file format."""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+from cvfile import CvFile, ExtractedPayload, extract
+from haystack import Document, component, logging
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
+from haystack.dataclasses import ByteStream
+logger = logging.getLogger(__name__)
+# MIME type prefixes that mark a payload as textual. Kept as a tuple so it can
+# be handed straight to ``str.startswith``; "text/" covers every ``text/*`` subtype.
+_TEXT_MIME_PREFIXES: tuple[str, ...] = (
+    "text/",
+    "application/json",
+    "application/xml",
+)
+def _is_text_payload(payload: ExtractedPayload) -> bool:
+    """Return ``True`` if the payload's MIME type starts with a known textual prefix.
+
+    :param payload:
+        Embedded payload whose ``mime_type`` is tested.
+    :returns:
+        ``True`` when ``payload.mime_type`` begins with any entry of
+        ``_TEXT_MIME_PREFIXES``, ``False`` otherwise.
+    """
+    # ``str.startswith`` accepts a tuple of prefixes, so no explicit loop is needed.
+    return payload.mime_type.startswith(_TEXT_MIME_PREFIXES)
+def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
+    """Build the per-payload metadata dictionary attached to a ``Document``.
+
+    :param payload:
+        The embedded payload being converted.
+    :param file:
+        The extracted ``.cv`` file the payload belongs to; supplies the
+        file-level metadata.
+    :returns:
+        A dictionary with the keys ``mime_type``, ``payload``, ``relationship``,
+        ``language``, ``primary``, ``cv_version`` and ``cv_generator``.
+    """
+    file_meta = file.metadata
+    # A payload without its own language falls back to the file's primary language.
+    language = payload.language or file_meta.primary_language
+    # The primary payload is recognised by comparing names.
+    is_primary = payload.name == file_meta.primary_payload
+    return {
+        "mime_type": payload.mime_type,
+        "payload": payload.name,
+        "relationship": payload.relationship,
+        "language": language,
+        "primary": is_primary,
+        "cv_version": file_meta.version,
+        "cv_generator": file_meta.generator,
+    }
+@component
+class CVFileToDocument:
+    """Haystack converter that turns ``.cv`` files into ``Document`` objects.
+
+    A ``.cv`` file is a PDF/A-3u carrying one or more textual payloads
+    (Markdown, HTML, JSON) as PDF Associated Files. For every source, this
+    component emits one ``Document`` per textual payload; the visual PDF
+    layer is deliberately not read because the embedded Markdown is a
+    cleaner text representation of the same content.
+
+    Pass ``primary_only=True`` to keep only the payload flagged as
+    ``primaryPayload`` in the file's XMP metadata (usually the canonical
+    Markdown copy) and drop every alternate.
+    """
+    def __init__(self, primary_only: bool = False) -> None:
+        """Create a CVFileToDocument component.
+
+        :param primary_only:
+            When ``True``, only the payload marked as ``primaryPayload`` in
+            the file's XMP metadata is emitted. When ``False`` (default),
+            one ``Document`` is emitted per textual payload (the primary
+            plus any language alternates and supplements).
+        """
+        self.primary_only = primary_only
+    @component.output_types(documents=list[Document])
+    def run(
+        self,
+        sources: list[str | Path | ByteStream],
+        meta: dict[str, Any] | list[dict[str, Any]] | None = None,
+    ) -> dict[str, Any]:
+        """Convert a list of ``.cv`` sources into ``Document`` objects.
+
+        Sources that cannot be read or parsed are logged as warnings and
+        skipped; they do not abort the run.
+
+        :param sources:
+            File paths or ``ByteStream`` objects pointing at ``.cv`` files.
+        :param meta:
+            Optional metadata for the produced documents. A single
+            dictionary is merged into every document. A list must be as
+            long as ``sources`` and is paired one to one with the inputs
+            (each dictionary is merged into every document produced from
+            its source).
+        :returns:
+            A dictionary whose ``documents`` key holds the ``Document``
+            objects extracted from all sources.
+        """
+        per_source_meta = normalize_metadata(meta, sources_count=len(sources))
+        converted: list[Document] = []
+        for source, extra_meta in zip(sources, per_source_meta, strict=True):
+            try:
+                stream = get_bytestream_from_source(source)
+            except Exception as exc:
+                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=exc)
+                continue
+            try:
+                cv_file = extract(stream.data)
+            except Exception as exc:
+                logger.warning(
+                    "Could not parse .cv file from {source}. Skipping it. Error: {error}",
+                    source=source,
+                    error=exc,
+                )
+                continue
+            base_meta = stream.meta or {}
+            # Prefer the stream's recorded path, then its name, then the source itself.
+            label = base_meta.get("file_path") or base_meta.get("file_name") or str(source)
+            for payload in cv_file.payloads:
+                if _is_text_payload(payload):
+                    described = _payload_meta(payload, cv_file)
+                    if not self.primary_only or described["primary"]:
+                        # Later layers win: stream meta < payload meta < caller meta < "source".
+                        combined = dict(base_meta)
+                        combined.update(described)
+                        combined.update(extra_meta)
+                        combined["source"] = label
+                        converted.append(Document(content=payload.text(), meta=combined))
+        return {"documents": converted}

cvfile_haystack-0.1.0/tests/__init__.py ADDED Viewed

File without changes

cvfile_haystack-0.1.0/tests/test_converter.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""Smoke tests for the Haystack CVFileToDocument converter."""
+from __future__ import annotations
+from pathlib import Path
+import pytest
+from haystack import Document
+from haystack.dataclasses import ByteStream
+from haystack_integrations.components.converters.cvfile import CVFileToDocument
+# Shared ``.cv`` fixture file; tests that need it skip when it does not exist.
+FIXTURE = Path(__file__).parents[3].joinpath("packages", "sdk-js", "tests", "fixtures", "python-produced.cv")
+@pytest.fixture(scope="module")
+def converter() -> CVFileToDocument:
+    """Provide a module-scoped default ``CVFileToDocument``.
+
+    Skips the requesting test when the shared fixture file is missing.
+    """
+    if FIXTURE.exists():
+        return CVFileToDocument()
+    pytest.skip(f"fixture not found: {FIXTURE}")
+def test_run_returns_documents(converter: CVFileToDocument) -> None:
+    """``run`` yields a non-empty list made up only of ``Document`` instances."""
+    documents = converter.run(sources=[FIXTURE])["documents"]
+    assert len(documents) >= 1
+    for document in documents:
+        assert isinstance(document, Document)
+def test_each_document_has_required_meta(converter: CVFileToDocument) -> None:
+    """Every produced document carries the required metadata keys."""
+    required_keys = ("source", "payload", "mime_type", "relationship", "language", "primary", "cv_version")
+    for document in converter.run(sources=[FIXTURE])["documents"]:
+        for key in required_keys:
+            assert key in document.meta, f"missing meta key {key} on {document.meta.get('payload')}"
+def test_exactly_one_primary_document(converter: CVFileToDocument) -> None:
+    """Exactly one document produced from the fixture is flagged as primary."""
+    documents = converter.run(sources=[FIXTURE])["documents"]
+    primary_count = sum(1 for document in documents if document.meta["primary"])
+    assert primary_count == 1
+def test_primary_only_emits_just_the_primary() -> None:
+    """With ``primary_only=True`` a single, non-blank primary document is produced."""
+    if not FIXTURE.exists():
+        pytest.skip(f"fixture not found: {FIXTURE}")
+    emitted = CVFileToDocument(primary_only=True).run(sources=[FIXTURE])["documents"]
+    assert len(emitted) == 1
+    only_document = emitted[0]
+    assert only_document.meta["primary"] is True
+    assert only_document.content.strip()
+def test_extra_meta_is_merged() -> None:
+    """A single ``meta`` dictionary passed to ``run`` ends up on every document."""
+    if not FIXTURE.exists():
+        pytest.skip(f"fixture not found: {FIXTURE}")
+    extra = {"candidate_id": "abc123"}
+    documents = CVFileToDocument().run(sources=[FIXTURE], meta=extra)["documents"]
+    assert documents, "expected at least one document"
+    for document in documents:
+        assert document.meta.get("candidate_id") == "abc123"
+def test_accepts_bytestream() -> None:
+    """A ``ByteStream`` source is converted and its ``file_name`` meta becomes ``source``."""
+    if not FIXTURE.exists():
+        pytest.skip(f"fixture not found: {FIXTURE}")
+    raw_bytes = FIXTURE.read_bytes()
+    stream = ByteStream(data=raw_bytes, meta={"file_name": "jane.cv"})
+    documents = CVFileToDocument().run(sources=[stream])["documents"]
+    assert documents
+    assert documents[0].meta["source"] == "jane.cv"
+def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
+    """A file whose bytes are not a valid ``.cv`` yields no documents instead of raising."""
+    bogus_path = tmp_path / "garbage.cv"
+    bogus_path.write_bytes(b"not a real cv file")
+    outcome = CVFileToDocument().run(sources=[bogus_path])
+    assert outcome["documents"] == []