cvfile-haystack 0.1.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -65,3 +65,16 @@ src-tauri/target/
65
65
  *.seed
66
66
  *.tgz
67
67
  .vercel
68
+
69
+ # Launch material (HN/PH drafts, runbook, screenshots) and private outreach
70
+ # data (PII): kept local, purged from history, never published.
71
+ launch/
72
+
73
+ # Internal planning documents: business strategy, launch gates, monetization.
74
+ # Kept local, never published.
75
+ PLAN.md
76
+ ROADMAP.md
77
+
78
+ # Generated demo/build artefacts that escape the dist/ pattern
79
+ dist-demo/
80
+ docs/public/embed/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cvfile-haystack
3
- Version: 0.1.0
3
+ Version: 0.3.2
4
4
  Summary: Haystack integration for the .cv open file format.
5
5
  Project-URL: Homepage, https://cvfile.org
6
6
  Project-URL: Repository, https://github.com/cvfile/cv
@@ -15,8 +15,9 @@ Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
18
19
  Requires-Python: >=3.10
19
- Requires-Dist: cvfile<1,>=0.1.0
20
+ Requires-Dist: cvfile<2,>=0.3.2
20
21
  Requires-Dist: haystack-ai<3,>=2.8
21
22
  Provides-Extra: dev
22
23
  Requires-Dist: pytest>=8.0; extra == 'dev'
@@ -63,6 +64,20 @@ alternates and supplements:
63
64
  converter = CVFileToDocument(primary_only=True)
64
65
  ```
65
66
 
67
+ ### Untrusted files
68
+
69
+ By default the converter runs `cvfile.validate()` on every source before
70
+ extracting anything. Files carrying forbidden active content (JavaScript,
71
+ launch or submit actions, external references), encryption, integrity digest
72
+ mismatches, or payloads over the spec size cap make `run()` raise
73
+ `ValueError` listing the issue codes. Resumes are classic untrusted input,
74
+ so keep the default when converting files you did not produce yourself.
75
+
76
+ ```python
77
+ converter = CVFileToDocument() # verify=True (default)
78
+ converter = CVFileToDocument(verify=False) # trusted files only
79
+ ```
80
+
66
81
  ### Pipeline use
67
82
 
68
83
  ```python
@@ -38,6 +38,20 @@ alternates and supplements:
38
38
  converter = CVFileToDocument(primary_only=True)
39
39
  ```
40
40
 
41
+ ### Untrusted files
42
+
43
+ By default the converter runs `cvfile.validate()` on every source before
44
+ extracting anything. Files carrying forbidden active content (JavaScript,
45
+ launch or submit actions, external references), encryption, integrity digest
46
+ mismatches, or payloads over the spec size cap make `run()` raise
47
+ `ValueError` listing the issue codes. Resumes are classic untrusted input,
48
+ so keep the default when converting files you did not produce yourself.
49
+
50
+ ```python
51
+ converter = CVFileToDocument() # verify=True (default)
52
+ converter = CVFileToDocument(verify=False) # trusted files only
53
+ ```
54
+
41
55
  ### Pipeline use
42
56
 
43
57
  ```python
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "cvfile-haystack"
7
- version = "0.1.0"
7
+ version = "0.3.2"
8
8
  description = "Haystack integration for the .cv open file format."
9
9
  readme = "README.md"
10
10
  license = { text = "Apache-2.0" }
@@ -19,9 +19,10 @@ classifiers = [
19
19
  "Programming Language :: Python :: 3.11",
20
20
  "Programming Language :: Python :: 3.12",
21
21
  "Programming Language :: Python :: 3.13",
22
+ "Programming Language :: Python :: 3.14",
22
23
  ]
23
24
  dependencies = [
24
- "cvfile>=0.1.0,<1",
25
+ "cvfile>=0.3.2,<2",
25
26
  "haystack-ai>=2.8,<3",
26
27
  ]
27
28
 
@@ -0,0 +1,229 @@
1
+ """Haystack ``@component`` converter for the .cv open file format."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from haystack import Document, component, logging
9
+ from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
10
+ from haystack.dataclasses import ByteStream
11
+
12
+ from cvfile import CvFile, ExtractedPayload, extract, validate
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# Lower-case MIME prefixes that identify a payload as text the converter can
# turn into a ``Document``. Kept as a tuple so it can be handed directly to
# ``str.startswith``.
_TEXT_MIME_PREFIXES: tuple[str, ...] = (
    "text/",
    "application/json",
    "application/xml",
)


def _is_text_payload(payload: ExtractedPayload) -> bool:
    """Return ``True`` when the payload's MIME type marks it as textual.

    The comparison is case-insensitive: media type and subtype names are not
    case sensitive, so a payload declared as ``Text/Markdown`` must be treated
    exactly like ``text/markdown`` instead of being silently skipped.

    :param payload:
        Extracted payload; only its ``mime_type`` attribute is read.
    :returns:
        ``True`` if ``mime_type`` starts with one of ``_TEXT_MIME_PREFIXES``.
    """
    # str.startswith accepts a tuple, so no explicit loop over prefixes is needed.
    return payload.mime_type.lower().startswith(_TEXT_MIME_PREFIXES)
25
+
26
+
27
def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
    """Build the metadata dictionary attached to a payload-level ``Document``.

    :param payload:
        The payload being converted; its MIME type, name, relationship and
        language are copied verbatim.
    :param file:
        The extracted ``.cv`` file the payload belongs to; its metadata
        supplies the primary payload name, format version and generator.
    :returns:
        A dictionary with the keys ``mime_type``, ``payload``,
        ``relationship``, ``language``, ``primary``, ``cv_version`` and
        ``cv_generator``.
    """
    cv_metadata = file.metadata
    payload_name = payload.name
    # A payload is primary when its name matches the file-level pointer.
    is_primary = payload_name == cv_metadata.primary_payload
    return dict(
        mime_type=payload.mime_type,
        payload=payload_name,
        relationship=payload.relationship,
        language=payload.language,
        primary=is_primary,
        cv_version=cv_metadata.version,
        cv_generator=cv_metadata.generator,
    )
37
+
38
+
39
def _verify_cv(data: bytes, source: str) -> None:
    """Raise when ``cvfile.validate()`` does not accept the given ``.cv`` bytes.

    ``validate`` is called with its default arguments (the original notes
    describe this as the lenient level). According to those notes, validation
    rejects forbidden active content (JavaScript, launch and submit actions,
    external references), encryption, integrity digest mismatches, and
    payloads over the spec size cap, which is the right default for untrusted
    input.

    :param data:
        Raw bytes of the ``.cv`` file.
    :param source:
        Human-readable label of the file, used only in the error message.
    :raises ValueError:
        When the validation report is not ``ok``. The message lists the
        distinct codes of all error-level issues in sorted order.
    """
    report = validate(data)
    if not report.ok:
        # Deduplicate error-level codes so each one appears once in the message.
        error_codes = set()
        for issue in report.issues:
            if issue.level == "error":
                error_codes.add(issue.code)
        codes = ", ".join(sorted(error_codes))
        raise ValueError(
            f".cv validation failed for {source}: {codes}. "
            "The file was rejected before extraction; pass verify=False only for trusted files."
        )
55
+
56
+
57
def _resolve_chunks(file: CvFile) -> list:
    """Return the text-resolved embedding chunks stored in ``file``.

    The work is delegated to ``cvfile.embed.resolve_embedding_chunks`` so that
    chunk text slicing (UTF-8 byte offsets, spec §5.1) lives in the core SDK
    only. The import happens lazily because ``cvfile.embed`` is an optional
    extra.

    :param file:
        The extracted ``.cv`` file.
    :returns:
        Whatever ``resolve_embedding_chunks`` returns, or an empty list when
        ``cvfile.embed`` cannot be imported.
    """
    try:
        from cvfile.embed import resolve_embedding_chunks as resolve
    except ImportError:
        # Optional extra missing: behave as if the file had no embeddings.
        chunks: list = []
    else:
        chunks = resolve(file)
    return chunks
69
+
70
+
71
def _chunk_meta(chunk: Any, file: CvFile) -> dict[str, Any]:
    """Build the metadata dictionary attached to a chunk-level ``Document``.

    :param chunk:
        A resolved embedding chunk; its ``id``, ``text_offset``,
        ``text_length``, ``model``, ``dimension`` and ``metric`` attributes
        are read.
    :param file:
        The extracted ``.cv`` file; its metadata supplies the primary
        language, format version and generator.
    :returns:
        A dictionary combining file-level keys (``language``, ``cv_version``,
        ``cv_generator``) with chunk and embedding keys.
    """
    cv_metadata = file.metadata
    meta: dict[str, Any] = {}
    # File-level fields come first so the key order matches the payload metadata.
    meta["language"] = cv_metadata.primary_language
    meta["cv_version"] = cv_metadata.version
    meta["cv_generator"] = cv_metadata.generator
    meta["chunk_id"] = chunk.id
    meta["chunk_offset"] = chunk.text_offset
    meta["chunk_length"] = chunk.text_length
    meta["embedding_model"] = chunk.model
    meta["embedding_dimension"] = chunk.dimension
    meta["embedding_metric"] = chunk.metric
    return meta
83
+
84
+
85
@component
class CVFileToDocument:
    """Convert ``.cv`` files into Haystack ``Document`` objects.

    A ``.cv`` file is a PDF/A-3u that carries one or more textual payloads
    (Markdown, HTML, JSON) as PDF Associated Files. This converter reads
    each ``.cv`` source and emits one ``Document`` per textual payload. The
    visual PDF layer is intentionally skipped because the embedded Markdown
    is a cleaner text representation of the same content.

    Set ``primary_only=True`` to emit only the payload marked as
    ``primaryPayload`` in the file's XMP metadata (usually the canonical
    Markdown copy), and skip all alternates.

    Set ``mode="chunks"`` to emit one ``Document`` per pre-computed embedding
    chunk instead of one per payload. Each chunk ``Document`` carries its vector
    on ``Document.embedding`` and its text is sliced from the markdown using
    UTF-8 byte offsets. Files without an embeddings payload fall back to a single
    ``Document`` built from the primary textual payload. In ``mode="chunks"``
    the ``primary_only`` flag is ignored (chunks already index a single text
    payload).

    By default (``verify=True``) each source is checked with
    ``cvfile.validate()`` before extraction: files carrying forbidden active
    content (JavaScript, launch or submit actions, external references),
    encryption, integrity digest mismatches, or oversized payloads make
    ``run()`` raise ``ValueError`` listing the issue codes. Unlike unreadable
    sources, which are logged and skipped, a validation failure is a security
    signal and is never silently dropped. Set ``verify=False`` to skip the
    check for trusted files only.
    """

    def __init__(self, primary_only: bool = False, *, mode: str = "payloads", verify: bool = True) -> None:
        """Create a CVFileToDocument component.

        :param primary_only:
            If ``True``, emit only the payload marked as ``primaryPayload``
            in the file's XMP metadata. If ``False`` (default), emit one
            ``Document`` per textual payload (the primary plus any
            language alternates and supplements). Ignored in ``mode="chunks"``.
        :param mode:
            ``"payloads"`` (default) emits one ``Document`` per textual payload.
            ``"chunks"`` emits one ``Document`` per pre-computed embedding chunk
            with its vector attached.
        :param verify:
            If ``True`` (default), run ``cvfile.validate()`` on every source
            and raise ``ValueError`` when a file fails validation. Set to
            ``False`` only for trusted files.

        :raises ValueError:
            When ``mode`` is neither ``"payloads"`` nor ``"chunks"``.
        """
        if mode not in ("payloads", "chunks"):
            raise ValueError("mode must be 'payloads' or 'chunks'")
        self.primary_only = primary_only
        self.mode = mode
        self.verify = verify

    @component.output_types(documents=list[Document])
    def run(
        self,
        sources: list[str | Path | ByteStream],
        meta: dict[str, Any] | list[dict[str, Any]] | None = None,
    ) -> dict[str, Any]:
        """Convert a list of ``.cv`` sources into ``Document`` objects.

        :param sources:
            File paths or ``ByteStream`` objects pointing at ``.cv`` files.
        :param meta:
            Optional metadata to attach to the produced documents. A single
            dictionary is merged into every document. A list must have the
            same length as ``sources`` and is zipped one to one with the
            inputs (the same dictionary is merged into every document
            produced from that source).

        :raises ValueError:
            When ``verify=True`` (the default) and a source fails
            ``cvfile.validate()``. The message names the source by the same
            label stored in ``Document.meta["source"]`` and lists the
            validation issue codes.

        :returns:
            A dictionary with key ``documents`` containing the list of
            ``Document`` objects extracted from every source.
        """
        documents: list[Document] = []
        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, source_meta in zip(sources, meta_list, strict=True):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                # Best effort by design: an unreadable source must not abort the batch.
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            stream_meta = bytestream.meta or {}
            # Resolved before verification so the validation error names the
            # file the same way the produced documents do, instead of falling
            # back to str(source), which for a ByteStream is the object's repr.
            source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)

            if self.verify:
                # Deliberately outside any try/except: a validation failure
                # is a security signal and must propagate to the caller.
                _verify_cv(bytestream.data, str(source_label))

            try:
                file = extract(bytestream.data)
            except Exception as e:
                logger.warning(
                    "Could not parse .cv file from {source}. Skipping it. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            if self.mode == "chunks":
                documents.extend(self._chunk_documents(file, stream_meta, source_meta, source_label))
                continue

            for payload in file.payloads:
                if not _is_text_payload(payload):
                    continue
                payload_meta = _payload_meta(payload, file)
                if self.primary_only and not payload_meta["primary"]:
                    continue
                # Later entries win: caller-supplied meta overrides payload
                # and stream meta, and "source" is always the resolved label.
                merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label}
                documents.append(Document(content=payload.text(), meta=merged))

        return {"documents": documents}

    @staticmethod
    def _chunk_documents(
        file: CvFile,
        stream_meta: dict[str, Any],
        source_meta: dict[str, Any],
        source_label: str,
    ) -> list[Document]:
        """Build the documents emitted for one file in ``mode="chunks"``.

        :param file:
            The extracted ``.cv`` file.
        :param stream_meta:
            Metadata carried by the source ``ByteStream``.
        :param source_meta:
            Caller-supplied metadata for this source.
        :param source_label:
            Value stored under the ``source`` metadata key.
        :returns:
            One ``Document`` per resolved embedding chunk, each with its
            vector on ``embedding``. When no chunks are resolved, a single
            ``Document`` for the primary textual payload, or an empty list if
            the file has no such payload.
        """
        chunks = _resolve_chunks(file)
        if not chunks:
            # Fallback: no embeddings available, emit the primary text payload whole.
            primary = next(
                (p for p in file.payloads if p.name == file.metadata.primary_payload and _is_text_payload(p)),
                None,
            )
            if primary is None:
                return []
            payload_meta = _payload_meta(primary, file)
            merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label}
            return [Document(content=primary.text(), meta=merged)]

        out: list[Document] = []
        for chunk in chunks:
            merged = {**stream_meta, **_chunk_meta(chunk, file), **source_meta, "source": source_label}
            out.append(Document(content=chunk.text, meta=merged, embedding=list(chunk.vector)))
        return out
@@ -11,6 +11,8 @@ from haystack.dataclasses import ByteStream
11
11
  from haystack_integrations.components.converters.cvfile import CVFileToDocument
12
12
 
13
13
  FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv"
14
+ UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv"
15
+ MALICIOUS_FIXTURE = Path(__file__).parents[3] / "spec" / "test-vectors" / "malicious" / "js-action.cv"
14
16
 
15
17
 
16
18
  @pytest.fixture(scope="module")
@@ -70,8 +72,63 @@ def test_accepts_bytestream() -> None:
70
72
 
71
73
 
72
74
  def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
73
- converter = CVFileToDocument()
75
+ """Parse failures are logged and skipped when verification is off; with
76
+ verify=True (default) the same garbage fails validation and raises."""
74
77
  not_a_cv = tmp_path / "garbage.cv"
75
78
  not_a_cv.write_bytes(b"not a real cv file")
76
- result = converter.run(sources=[not_a_cv])
79
+ result = CVFileToDocument(verify=False).run(sources=[not_a_cv])
77
80
  assert result["documents"] == []
81
+ with pytest.raises(ValueError, match="pdf-parse-failed"):
82
+ CVFileToDocument().run(sources=[not_a_cv])
83
+
84
+
85
def test_chunks_mode_attaches_a_vector_per_chunk() -> None:
    """Every document produced in chunks mode carries a float vector of the advertised size."""
    if not FIXTURE.exists():
        pytest.skip(f"fixture not found: {FIXTURE}")
    converter = CVFileToDocument(mode="chunks")
    chunk_docs = converter.run(sources=[FIXTURE])["documents"]
    assert len(chunk_docs) >= 1
    for chunk_doc in chunk_docs:
        vector = chunk_doc.embedding
        assert vector is not None
        # The vector length must agree with the dimension recorded in the metadata.
        assert len(vector) == chunk_doc.meta["embedding_dimension"]
        assert all(isinstance(component_value, float) for component_value in vector)
        assert chunk_doc.content.strip(), "chunk text should not be empty"
95
+
96
+
97
def test_invalid_mode_rejected() -> None:
    """Constructing the converter with an unknown mode raises ``ValueError``."""
    unknown_mode = "bogus"
    with pytest.raises(ValueError):
        CVFileToDocument(mode=unknown_mode)
100
+
101
+
102
def test_verify_rejects_malicious_file() -> None:
    """With the default ``verify=True`` the JavaScript-action fixture is refused."""
    if not MALICIOUS_FIXTURE.exists():
        pytest.skip(f"fixture not found: {MALICIOUS_FIXTURE}")
    converter = CVFileToDocument()
    # The error message must name the issue code reported by validation.
    with pytest.raises(ValueError, match="javascript-action"):
        converter.run(sources=[MALICIOUS_FIXTURE])
107
+
108
+
109
def test_verify_false_converts_malicious_file() -> None:
    """Opting out of verification lets the same fixture be converted."""
    if not MALICIOUS_FIXTURE.exists():
        pytest.skip(f"fixture not found: {MALICIOUS_FIXTURE}")
    unchecked = CVFileToDocument(verify=False)
    result = unchecked.run(sources=[MALICIOUS_FIXTURE])
    assert len(result["documents"]) >= 1
114
+
115
+
116
def test_verify_default_passes_on_valid_file() -> None:
    """Verification is on by default and does not reject the valid fixture."""
    if not FIXTURE.exists():
        pytest.skip(f"fixture not found: {FIXTURE}")
    converter = CVFileToDocument()
    assert converter.verify is True
    produced = converter.run(sources=[FIXTURE])["documents"]
    assert len(produced) >= 1
122
+
123
+
124
def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None:
    """Chunk text of a multi-byte fixture must come back as intact characters.

    Chunks are addressed by UTF-8 byte offsets, so a slice taken on character
    offsets (or on a wrong byte boundary) would cut a multi-byte character in
    half. The test checks that known non-ASCII strings survive and that no
    chunk shows signs of a broken character.
    """
    if not UNICODE_FIXTURE.exists():
        pytest.skip(f"fixture not found: {UNICODE_FIXTURE}")
    docs = CVFileToDocument(mode="chunks").run(sources=[UNICODE_FIXTURE])["documents"]
    joined = "".join(d.content for d in docs)
    assert "Élodie" in joined
    assert "工程師" in joined
    assert "🚀" in joined
    assert "经验" in joined
    for doc in docs:
        # The round trip only fails (by raising) on lone surrogates; for any
        # other string it is trivially equal, so on its own it cannot detect
        # a character that was cut and decoded leniently.
        assert doc.content == doc.content.encode("utf-8").decode("utf-8")
        # A lenient decode of a split multi-byte sequence would leave U+FFFD
        # behind; assumes the fixture text itself contains no U+FFFD.
        assert "\ufffd" not in doc.content, "chunk text contains a replacement character"
@@ -1,117 +0,0 @@
1
- """Haystack ``@component`` converter for the .cv open file format."""
2
-
3
- from __future__ import annotations
4
-
5
- from pathlib import Path
6
- from typing import Any
7
-
8
- from cvfile import CvFile, ExtractedPayload, extract
9
- from haystack import Document, component, logging
10
- from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
11
- from haystack.dataclasses import ByteStream
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
- _TEXT_MIME_PREFIXES: tuple[str, ...] = (
16
- "text/",
17
- "application/json",
18
- "application/xml",
19
- )
20
-
21
-
22
- def _is_text_payload(payload: ExtractedPayload) -> bool:
23
- return any(payload.mime_type.startswith(prefix) for prefix in _TEXT_MIME_PREFIXES)
24
-
25
-
26
- def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
27
- return {
28
- "mime_type": payload.mime_type,
29
- "payload": payload.name,
30
- "relationship": payload.relationship,
31
- "language": payload.language or file.metadata.primary_language,
32
- "primary": payload.name == file.metadata.primary_payload,
33
- "cv_version": file.metadata.version,
34
- "cv_generator": file.metadata.generator,
35
- }
36
-
37
-
38
- @component
39
- class CVFileToDocument:
40
- """Convert ``.cv`` files into Haystack ``Document`` objects.
41
-
42
- A ``.cv`` file is a PDF/A-3u that carries one or more textual payloads
43
- (Markdown, HTML, JSON) as PDF Associated Files. This converter reads
44
- each ``.cv`` source and emits one ``Document`` per textual payload. The
45
- visual PDF layer is intentionally skipped because the embedded Markdown
46
- is a cleaner text representation of the same content.
47
-
48
- Set ``primary_only=True`` to emit only the payload marked as
49
- ``primaryPayload`` in the file's XMP metadata (usually the canonical
50
- Markdown copy), and skip all alternates.
51
- """
52
-
53
- def __init__(self, primary_only: bool = False) -> None:
54
- """Create a CVFileToDocument component.
55
-
56
- :param primary_only:
57
- If ``True``, emit only the payload marked as ``primaryPayload``
58
- in the file's XMP metadata. If ``False`` (default), emit one
59
- ``Document`` per textual payload (the primary plus any
60
- language alternates and supplements).
61
- """
62
- self.primary_only = primary_only
63
-
64
- @component.output_types(documents=list[Document])
65
- def run(
66
- self,
67
- sources: list[str | Path | ByteStream],
68
- meta: dict[str, Any] | list[dict[str, Any]] | None = None,
69
- ) -> dict[str, Any]:
70
- """Convert a list of ``.cv`` sources into ``Document`` objects.
71
-
72
- :param sources:
73
- File paths or ``ByteStream`` objects pointing at ``.cv`` files.
74
- :param meta:
75
- Optional metadata to attach to the produced documents. A single
76
- dictionary is merged into every document. A list must have the
77
- same length as ``sources`` and is zipped one to one with the
78
- inputs (the same dictionary is merged into every document
79
- produced from that source).
80
-
81
- :returns:
82
- A dictionary with key ``documents`` containing the list of
83
- ``Document`` objects extracted from every source.
84
- """
85
- documents: list[Document] = []
86
- meta_list = normalize_metadata(meta, sources_count=len(sources))
87
-
88
- for source, source_meta in zip(sources, meta_list, strict=True):
89
- try:
90
- bytestream = get_bytestream_from_source(source)
91
- except Exception as e:
92
- logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
93
- continue
94
-
95
- try:
96
- file = extract(bytestream.data)
97
- except Exception as e:
98
- logger.warning(
99
- "Could not parse .cv file from {source}. Skipping it. Error: {error}",
100
- source=source,
101
- error=e,
102
- )
103
- continue
104
-
105
- stream_meta = bytestream.meta or {}
106
- source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)
107
-
108
- for payload in file.payloads:
109
- if not _is_text_payload(payload):
110
- continue
111
- payload_meta = _payload_meta(payload, file)
112
- if self.primary_only and not payload_meta["primary"]:
113
- continue
114
- merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label}
115
- documents.append(Document(content=payload.text(), meta=merged))
116
-
117
- return {"documents": documents}