cvfile-haystack 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ # Build output
2
+ dist/
3
+ build/
4
+ out/
5
+ .next/
6
+ .astro/
7
+ .turbo/
8
+ .cache/
9
+ .parcel-cache/
10
+ *.tsbuildinfo
11
+
12
+ # Dependencies
13
+ node_modules/
14
+ .pnpm-store/
15
+ __pycache__/
16
+ *.pyc
17
+ .venv/
18
+ venv/
19
+ .tox/
20
+ *.egg-info/
21
+
22
+ # Go
23
+ vendor/
24
+
25
+ # Logs
26
+ *.log
27
+ npm-debug.log*
28
+ yarn-debug.log*
29
+ yarn-error.log*
30
+ pnpm-debug.log*
31
+
32
+ # Editor / OS
33
+ .DS_Store
34
+ Thumbs.db
35
+ .vscode/
36
+ .idea/
37
+ *.swp
38
+ *.swo
39
+ *~
40
+
41
+ # Test artefacts
42
+ coverage/
43
+ .nyc_output/
44
+ .pytest_cache/
45
+ .mypy_cache/
46
+ .ruff_cache/
47
+
48
+ # Secrets — never commit
49
+ .env
50
+ .env.*
51
+ !.env.example
52
+ *.pem
53
+ *.key
54
+ hf_*
55
+
56
+ # Don't ignore generated fixtures — committing them keeps tests green on a
57
+ # fresh clone (no need to rebuild jane-doe.cv before running cross-SDK
58
+ # interop / security / search tests).
59
+
60
+ # Tauri
61
+ src-tauri/target/
62
+
63
+ # Misc
64
+ *.pid
65
+ *.seed
66
+ *.tgz
67
+ .vercel
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: cvfile-haystack
3
+ Version: 0.1.0
4
+ Summary: Haystack integration for the .cv open file format.
5
+ Project-URL: Homepage, https://cvfile.org
6
+ Project-URL: Repository, https://github.com/cvfile/cv
7
+ Project-URL: Issues, https://github.com/cvfile/cv/issues
8
+ Author: cvfile.org
9
+ License: Apache-2.0
10
+ Keywords: ats,converter,cv,haystack,pdf,pdfa,rag,resume
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: cvfile<1,>=0.1.0
20
+ Requires-Dist: haystack-ai<3,>=2.8
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest>=8.0; extra == 'dev'
23
+ Requires-Dist: ruff>=0.7; extra == 'dev'
24
+ Description-Content-Type: text/markdown
25
+
26
+ # cvfile-haystack
27
+
28
+ Haystack 2.x converter component for the [`.cv`](https://cvfile.org) open file format.
29
+
30
+ A `.cv` file is a PDF/A-3u file carrying a Markdown copy of the same content
31
+ (plus optional HTML and JSON Resume) as PDF Associated Files. Instead of
32
+ running OCR on the PDF, this component reads the embedded text payloads directly and
33
+ emits Haystack `Document` objects ready for indexing.
34
+
35
+ ## Install
36
+
37
+ ```bash
38
+ pip install cvfile-haystack
39
+ ```
40
+
41
+ ## Use
42
+
43
+ ```python
44
+ from haystack_integrations.components.converters.cvfile import CVFileToDocument
45
+
46
+ converter = CVFileToDocument()
47
+ result = converter.run(sources=["resume.cv"])
48
+ documents = result["documents"]
49
+
50
+ for doc in documents:
51
+ print(doc.meta["payload"], doc.meta["mime_type"], len(doc.content))
52
+ ```
53
+
54
+ You get one `Document` per textual payload found in the file. The Markdown
55
+ copy (typically `resume.md`) is the one flagged with `meta["primary"] = True`.
56
+
57
+ ### Primary only
58
+
59
+ If you only want the canonical Markdown copy and want to skip language
60
+ alternates and supplements:
61
+
62
+ ```python
63
+ converter = CVFileToDocument(primary_only=True)
64
+ ```
65
+
66
+ ### Pipeline use
67
+
68
+ ```python
69
+ from haystack import Pipeline
70
+ from haystack.components.embedders import SentenceTransformersDocumentEmbedder
71
+ from haystack.components.writers import DocumentWriter
72
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
73
+ from haystack_integrations.components.converters.cvfile import CVFileToDocument
74
+
75
+ store = InMemoryDocumentStore()
76
+ pipe = Pipeline()
77
+ pipe.add_component("read", CVFileToDocument(primary_only=True))
78
+ pipe.add_component("embed", SentenceTransformersDocumentEmbedder(model="BAAI/bge-m3"))
79
+ pipe.add_component("write", DocumentWriter(document_store=store))
80
+ pipe.connect("read.documents", "embed.documents")
81
+ pipe.connect("embed.documents", "write.documents")
82
+
83
+ pipe.run({"read": {"sources": ["resumes/jane.cv", "resumes/john.cv"]}})
84
+ ```
85
+
86
+ ## Metadata fields
87
+
88
+ | Key | Description |
89
+ |---|---|
90
+ | `source` | The file path (or stream name) the document came from |
91
+ | `payload` | Name of the embedded file (e.g. `resume.md`) |
92
+ | `mime_type` | MIME of the payload (`text/markdown`, `text/html`, `application/json`) |
93
+ | `relationship` | PDF Associated Files relationship (`Alternative` for primary alternates) |
94
+ | `language` | BCP 47 language tag for this payload |
95
+ | `primary` | `True` for the payload declared as primary in the file's XMP metadata |
96
+ | `cv_version` | Version of the `.cv` spec the file conforms to |
97
+ | `cv_generator` | Tool that produced the file, if recorded |
98
+
99
+ ## License
100
+
101
+ Apache-2.0.
@@ -0,0 +1,76 @@
1
+ # cvfile-haystack
2
+
3
+ Haystack 2.x converter component for the [`.cv`](https://cvfile.org) open file format.
4
+
5
+ A `.cv` file is a PDF/A-3u file carrying a Markdown copy of the same content
6
+ (plus optional HTML and JSON Resume) as PDF Associated Files. Instead of
7
+ running OCR on the PDF, this component reads the embedded text payloads directly and
8
+ emits Haystack `Document` objects ready for indexing.
9
+
10
+ ## Install
11
+
12
+ ```bash
13
+ pip install cvfile-haystack
14
+ ```
15
+
16
+ ## Use
17
+
18
+ ```python
19
+ from haystack_integrations.components.converters.cvfile import CVFileToDocument
20
+
21
+ converter = CVFileToDocument()
22
+ result = converter.run(sources=["resume.cv"])
23
+ documents = result["documents"]
24
+
25
+ for doc in documents:
26
+ print(doc.meta["payload"], doc.meta["mime_type"], len(doc.content))
27
+ ```
28
+
29
+ You get one `Document` per textual payload found in the file. The Markdown
30
+ copy (typically `resume.md`) is the one flagged with `meta["primary"] = True`.
31
+
32
+ ### Primary only
33
+
34
+ If you only want the canonical Markdown copy and want to skip language
35
+ alternates and supplements:
36
+
37
+ ```python
38
+ converter = CVFileToDocument(primary_only=True)
39
+ ```
40
+
41
+ ### Pipeline use
42
+
43
+ ```python
44
+ from haystack import Pipeline
45
+ from haystack.components.embedders import SentenceTransformersDocumentEmbedder
46
+ from haystack.components.writers import DocumentWriter
47
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
48
+ from haystack_integrations.components.converters.cvfile import CVFileToDocument
49
+
50
+ store = InMemoryDocumentStore()
51
+ pipe = Pipeline()
52
+ pipe.add_component("read", CVFileToDocument(primary_only=True))
53
+ pipe.add_component("embed", SentenceTransformersDocumentEmbedder(model="BAAI/bge-m3"))
54
+ pipe.add_component("write", DocumentWriter(document_store=store))
55
+ pipe.connect("read.documents", "embed.documents")
56
+ pipe.connect("embed.documents", "write.documents")
57
+
58
+ pipe.run({"read": {"sources": ["resumes/jane.cv", "resumes/john.cv"]}})
59
+ ```
60
+
61
+ ## Metadata fields
62
+
63
+ | Key | Description |
64
+ |---|---|
65
+ | `source` | The file path (or stream name) the document came from |
66
+ | `payload` | Name of the embedded file (e.g. `resume.md`) |
67
+ | `mime_type` | MIME of the payload (`text/markdown`, `text/html`, `application/json`) |
68
+ | `relationship` | PDF Associated Files relationship (`Alternative` for primary alternates) |
69
+ | `language` | BCP 47 language tag for this payload |
70
+ | `primary` | `True` for the payload declared as primary in the file's XMP metadata |
71
+ | `cv_version` | Version of the `.cv` spec the file conforms to |
72
+ | `cv_generator` | Tool that produced the file, if recorded |
73
+
74
+ ## License
75
+
76
+ Apache-2.0.
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "cvfile-haystack"
7
+ version = "0.1.0"
8
+ description = "Haystack integration for the .cv open file format."
9
+ readme = "README.md"
10
+ license = { text = "Apache-2.0" }
11
+ requires-python = ">=3.10"
12
+ authors = [{ name = "cvfile.org" }]
13
+ keywords = ["haystack", "cv", "resume", "pdf", "pdfa", "rag", "ats", "converter"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "License :: OSI Approved :: Apache Software License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ ]
23
+ dependencies = [
24
+ "cvfile>=0.1.0,<1",
25
+ "haystack-ai>=2.8,<3",
26
+ ]
27
+
28
+ [project.urls]
29
+ Homepage = "https://cvfile.org"
30
+ Repository = "https://github.com/cvfile/cv"
31
+ Issues = "https://github.com/cvfile/cv/issues"
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=8.0",
36
+ "ruff>=0.7",
37
+ ]
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["src/haystack_integrations"]
41
+
42
+ [tool.ruff]
43
+ line-length = 120
44
+ target-version = "py310"
45
+
46
+ [tool.ruff.lint]
47
+ select = ["E", "F", "I", "B", "UP", "N", "SIM", "RUF"]
@@ -0,0 +1,5 @@
1
+ """Haystack converter for the .cv open file format."""
2
+
3
+ from haystack_integrations.components.converters.cvfile.converter import CVFileToDocument
4
+
5
+ __all__ = ["CVFileToDocument"]
@@ -0,0 +1,117 @@
1
+ """Haystack ``@component`` converter for the .cv open file format."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from cvfile import CvFile, ExtractedPayload, extract
9
+ from haystack import Document, component, logging
10
+ from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
11
+ from haystack.dataclasses import ByteStream
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ _TEXT_MIME_PREFIXES: tuple[str, ...] = (
16
+ "text/",
17
+ "application/json",
18
+ "application/xml",
19
+ )
20
+
21
+
22
+ def _is_text_payload(payload: ExtractedPayload) -> bool:
23
+ return any(payload.mime_type.startswith(prefix) for prefix in _TEXT_MIME_PREFIXES)
24
+
25
+
26
+ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
27
+ return {
28
+ "mime_type": payload.mime_type,
29
+ "payload": payload.name,
30
+ "relationship": payload.relationship,
31
+ "language": payload.language or file.metadata.primary_language,
32
+ "primary": payload.name == file.metadata.primary_payload,
33
+ "cv_version": file.metadata.version,
34
+ "cv_generator": file.metadata.generator,
35
+ }
36
+
37
+
38
+ @component
39
+ class CVFileToDocument:
40
+ """Convert ``.cv`` files into Haystack ``Document`` objects.
41
+
42
+ A ``.cv`` file is a PDF/A-3u that carries one or more textual payloads
43
+ (Markdown, HTML, JSON) as PDF Associated Files. This converter reads
44
+ each ``.cv`` source and emits one ``Document`` per textual payload. The
45
+ visual PDF layer is intentionally skipped because the embedded Markdown
46
+ is a cleaner text representation of the same content.
47
+
48
+ Set ``primary_only=True`` to emit only the payload marked as
49
+ ``primaryPayload`` in the file's XMP metadata (usually the canonical
50
+ Markdown copy), and skip all alternates.
51
+ """
52
+
53
+ def __init__(self, primary_only: bool = False) -> None:
54
+ """Create a CVFileToDocument component.
55
+
56
+ :param primary_only:
57
+ If ``True``, emit only the payload marked as ``primaryPayload``
58
+ in the file's XMP metadata. If ``False`` (default), emit one
59
+ ``Document`` per textual payload (the primary plus any
60
+ language alternates and supplements).
61
+ """
62
+ self.primary_only = primary_only
63
+
64
+ @component.output_types(documents=list[Document])
65
+ def run(
66
+ self,
67
+ sources: list[str | Path | ByteStream],
68
+ meta: dict[str, Any] | list[dict[str, Any]] | None = None,
69
+ ) -> dict[str, Any]:
70
+ """Convert a list of ``.cv`` sources into ``Document`` objects.
71
+
72
+ :param sources:
73
+ File paths or ``ByteStream`` objects pointing at ``.cv`` files.
74
+ :param meta:
75
+ Optional metadata to attach to the produced documents. A single
76
+ dictionary is merged into every document. A list must have the
77
+ same length as ``sources`` and is zipped one to one with the
78
+ inputs (the same dictionary is merged into every document
79
+ produced from that source).
80
+
81
+ :returns:
82
+ A dictionary with key ``documents`` containing the list of
83
+ ``Document`` objects extracted from every source.
84
+ """
85
+ documents: list[Document] = []
86
+ meta_list = normalize_metadata(meta, sources_count=len(sources))
87
+
88
+ for source, source_meta in zip(sources, meta_list, strict=True):
89
+ try:
90
+ bytestream = get_bytestream_from_source(source)
91
+ except Exception as e:
92
+ logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
93
+ continue
94
+
95
+ try:
96
+ file = extract(bytestream.data)
97
+ except Exception as e:
98
+ logger.warning(
99
+ "Could not parse .cv file from {source}. Skipping it. Error: {error}",
100
+ source=source,
101
+ error=e,
102
+ )
103
+ continue
104
+
105
+ stream_meta = bytestream.meta or {}
106
+ source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)
107
+
108
+ for payload in file.payloads:
109
+ if not _is_text_payload(payload):
110
+ continue
111
+ payload_meta = _payload_meta(payload, file)
112
+ if self.primary_only and not payload_meta["primary"]:
113
+ continue
114
+ merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label}
115
+ documents.append(Document(content=payload.text(), meta=merged))
116
+
117
+ return {"documents": documents}
File without changes
@@ -0,0 +1,77 @@
1
+ """Smoke tests for the Haystack CVFileToDocument converter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+ from haystack import Document
9
+ from haystack.dataclasses import ByteStream
10
+
11
+ from haystack_integrations.components.converters.cvfile import CVFileToDocument
12
+
13
# Sample .cv file resolved relative to the fourth ancestor directory of this
# test module; presumably only present in a full repository checkout, hence
# the skip below when it is missing.
FIXTURE = Path(__file__).parents[3].joinpath("packages", "sdk-js", "tests", "fixtures", "python-produced.cv")


@pytest.fixture(scope="module")
def converter() -> CVFileToDocument:
    """Provide a default converter, skipping the requesting test if the fixture file is absent."""
    if FIXTURE.exists():
        return CVFileToDocument()
    pytest.skip(f"fixture not found: {FIXTURE}")
21
+
22
+
23
def test_run_returns_documents(converter: CVFileToDocument) -> None:
    """Running the converter on the fixture yields at least one ``Document``."""
    produced = converter.run(sources=[FIXTURE])["documents"]
    assert len(produced) >= 1
    assert all(isinstance(item, Document) for item in produced)
28
+
29
+
30
def test_each_document_has_required_meta(converter: CVFileToDocument) -> None:
    """Every emitted document exposes the expected metadata keys."""
    required_keys = ("source", "payload", "mime_type", "relationship", "language", "primary", "cv_version")
    for doc in converter.run(sources=[FIXTURE])["documents"]:
        for key in required_keys:
            assert key in doc.meta, f"missing meta key {key} on {doc.meta.get('payload')}"
35
+
36
+
37
def test_exactly_one_primary_document(converter: CVFileToDocument) -> None:
    """Exactly one document from the fixture is flagged as primary."""
    emitted = converter.run(sources=[FIXTURE])["documents"]
    primary_count = sum(1 for item in emitted if item.meta["primary"])
    assert primary_count == 1
41
+
42
+
43
def test_primary_only_emits_just_the_primary() -> None:
    """With ``primary_only=True`` a single, non-empty primary document is produced."""
    if not FIXTURE.exists():
        pytest.skip(f"fixture not found: {FIXTURE}")
    emitted = CVFileToDocument(primary_only=True).run(sources=[FIXTURE])["documents"]
    assert len(emitted) == 1
    only_doc = emitted[0]
    assert only_doc.meta["primary"] is True
    assert only_doc.content.strip()
51
+
52
+
53
def test_extra_meta_is_merged() -> None:
    """Caller-supplied metadata is present on every produced document."""
    if not FIXTURE.exists():
        pytest.skip(f"fixture not found: {FIXTURE}")
    extra = {"candidate_id": "abc123"}
    emitted = CVFileToDocument().run(sources=[FIXTURE], meta=extra)["documents"]
    assert emitted, "expected at least one document"
    assert all(item.meta.get("candidate_id") == "abc123" for item in emitted)
60
+
61
+
62
def test_accepts_bytestream() -> None:
    """A ``ByteStream`` source is accepted and its ``file_name`` becomes the ``source`` label."""
    if not FIXTURE.exists():
        pytest.skip(f"fixture not found: {FIXTURE}")
    in_memory = ByteStream(data=FIXTURE.read_bytes(), meta={"file_name": "jane.cv"})
    emitted = CVFileToDocument().run(sources=[in_memory])["documents"]
    assert emitted
    first_doc = emitted[0]
    assert first_doc.meta["source"] == "jane.cv"
70
+
71
+
72
def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
    """A file that is not a valid ``.cv`` produces no documents instead of raising."""
    bogus = tmp_path / "garbage.cv"
    bogus.write_bytes(b"not a real cv file")
    outcome = CVFileToDocument().run(sources=[bogus])
    assert outcome["documents"] == []