cvfile-haystack 0.2.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -65,3 +65,16 @@ src-tauri/target/
65
65
  *.seed
66
66
  *.tgz
67
67
  .vercel
68
+
69
+ # Launch material (HN/PH drafts, runbook, screenshots) and private outreach
70
+ # data (PII): kept local, purged from history, never published.
71
+ launch/
72
+
73
+ # Internal planning documents: business strategy, launch gates, monetization.
74
+ # Kept local, never published.
75
+ PLAN.md
76
+ ROADMAP.md
77
+
78
+ # Generated demo/build artefacts that escape the dist/ pattern
79
+ dist-demo/
80
+ docs/public/embed/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cvfile-haystack
3
- Version: 0.2.0
3
+ Version: 0.3.2
4
4
  Summary: Haystack integration for the .cv open file format.
5
5
  Project-URL: Homepage, https://cvfile.org
6
6
  Project-URL: Repository, https://github.com/cvfile/cv
@@ -15,8 +15,9 @@ Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
18
19
  Requires-Python: >=3.10
19
- Requires-Dist: cvfile<2,>=0.1
20
+ Requires-Dist: cvfile<2,>=0.3.2
20
21
  Requires-Dist: haystack-ai<3,>=2.8
21
22
  Provides-Extra: dev
22
23
  Requires-Dist: pytest>=8.0; extra == 'dev'
@@ -63,6 +64,20 @@ alternates and supplements:
63
64
  converter = CVFileToDocument(primary_only=True)
64
65
  ```
65
66
 
67
+ ### Untrusted files
68
+
69
+ By default the converter runs `cvfile.validate()` on every source before
70
+ extracting anything. Files carrying forbidden active content (JavaScript,
71
+ launch or submit actions, external references), encryption, integrity digest
72
+ mismatches, or payloads over the spec size cap make `run()` raise
73
+ `ValueError` listing the issue codes. Resumes are classic untrusted input,
74
+ so keep the default when converting files you did not produce yourself.
75
+
76
+ ```python
77
+ converter = CVFileToDocument() # verify=True (default)
78
+ converter = CVFileToDocument(verify=False) # trusted files only
79
+ ```
80
+
66
81
  ### Pipeline use
67
82
 
68
83
  ```python
@@ -38,6 +38,20 @@ alternates and supplements:
38
38
  converter = CVFileToDocument(primary_only=True)
39
39
  ```
40
40
 
41
+ ### Untrusted files
42
+
43
+ By default the converter runs `cvfile.validate()` on every source before
44
+ extracting anything. Files carrying forbidden active content (JavaScript,
45
+ launch or submit actions, external references), encryption, integrity digest
46
+ mismatches, or payloads over the spec size cap make `run()` raise
47
+ `ValueError` listing the issue codes. Resumes are classic untrusted input,
48
+ so keep the default when converting files you did not produce yourself.
49
+
50
+ ```python
51
+ converter = CVFileToDocument() # verify=True (default)
52
+ converter = CVFileToDocument(verify=False) # trusted files only
53
+ ```
54
+
41
55
  ### Pipeline use
42
56
 
43
57
  ```python
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "cvfile-haystack"
7
- version = "0.2.0"
7
+ version = "0.3.2"
8
8
  description = "Haystack integration for the .cv open file format."
9
9
  readme = "README.md"
10
10
  license = { text = "Apache-2.0" }
@@ -19,9 +19,10 @@ classifiers = [
19
19
  "Programming Language :: Python :: 3.11",
20
20
  "Programming Language :: Python :: 3.12",
21
21
  "Programming Language :: Python :: 3.13",
22
+ "Programming Language :: Python :: 3.14",
22
23
  ]
23
24
  dependencies = [
24
- "cvfile>=0.1,<2",
25
+ "cvfile>=0.3.2,<2",
25
26
  "haystack-ai>=2.8,<3",
26
27
  ]
27
28
 
@@ -5,11 +5,12 @@ from __future__ import annotations
5
5
  from pathlib import Path
6
6
  from typing import Any
7
7
 
8
- from cvfile import CvFile, ExtractedPayload, extract
9
8
  from haystack import Document, component, logging
10
9
  from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
11
10
  from haystack.dataclasses import ByteStream
12
11
 
12
+ from cvfile import CvFile, ExtractedPayload, extract, validate
13
+
13
14
  logger = logging.getLogger(__name__)
14
15
 
15
16
  _TEXT_MIME_PREFIXES: tuple[str, ...] = (
@@ -35,6 +36,24 @@ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
35
36
  }
36
37
 
37
38
 
39
+ def _verify_cv(data: bytes, source: str) -> None:
40
+ """Refuse to convert a .cv file that fails ``cvfile.validate()`` (lenient level).
41
+
42
+ Validation rejects forbidden active content (JavaScript, launch and submit
43
+ actions, external references), encryption, integrity digest mismatches, and
44
+ payloads over the spec size cap, which is the right default for untrusted
45
+ input.
46
+ """
47
+ report = validate(data)
48
+ if report.ok:
49
+ return
50
+ codes = ", ".join(sorted({issue.code for issue in report.issues if issue.level == "error"}))
51
+ raise ValueError(
52
+ f".cv validation failed for {source}: {codes}. "
53
+ "The file was rejected before extraction; pass verify=False only for trusted files."
54
+ )
55
+
56
+
38
57
  def _resolve_chunks(file: CvFile) -> list:
39
58
  """Decode the file's embeddings.cbor into text-resolved chunks.
40
59
 
@@ -83,9 +102,18 @@ class CVFileToDocument:
83
102
  UTF-8 byte offsets. Files without an embeddings payload fall back to a single
84
103
  Markdown ``Document``. In ``mode="chunks"`` the ``primary_only`` flag is
85
104
  ignored (chunks already index a single text payload).
105
+
106
+ By default (``verify=True``) each source is checked with
107
+ ``cvfile.validate()`` before extraction: files carrying forbidden active
108
+ content (JavaScript, launch or submit actions, external references),
109
+ encryption, integrity digest mismatches, or oversized payloads make
110
+ ``run()`` raise ``ValueError`` listing the issue codes. Unlike unreadable
111
+ sources, which are logged and skipped, a validation failure is a security
112
+ signal and is never silently dropped. Set ``verify=False`` to skip the
113
+ check for trusted files only.
86
114
  """
87
115
 
88
- def __init__(self, primary_only: bool = False, *, mode: str = "payloads") -> None:
116
+ def __init__(self, primary_only: bool = False, *, mode: str = "payloads", verify: bool = True) -> None:
89
117
  """Create a CVFileToDocument component.
90
118
 
91
119
  :param primary_only:
@@ -97,11 +125,16 @@ class CVFileToDocument:
97
125
  ``"payloads"`` (default) emits one ``Document`` per textual payload.
98
126
  ``"chunks"`` emits one ``Document`` per pre-computed embedding chunk
99
127
  with its vector attached.
128
+ :param verify:
129
+ If ``True`` (default), run ``cvfile.validate()`` on every source
130
+ and raise ``ValueError`` when a file fails validation. Set to
131
+ ``False`` only for trusted files.
100
132
  """
101
133
  if mode not in ("payloads", "chunks"):
102
134
  raise ValueError("mode must be 'payloads' or 'chunks'")
103
135
  self.primary_only = primary_only
104
136
  self.mode = mode
137
+ self.verify = verify
105
138
 
106
139
  @component.output_types(documents=list[Document])
107
140
  def run(
@@ -120,6 +153,11 @@ class CVFileToDocument:
120
153
  inputs (the same dictionary is merged into every document
121
154
  produced from that source).
122
155
 
156
+ :raises ValueError:
157
+ When ``verify=True`` (the default) and a source fails
158
+ ``cvfile.validate()``. The message lists the validation issue
159
+ codes.
160
+
123
161
  :returns:
124
162
  A dictionary with key ``documents`` containing the list of
125
163
  ``Document`` objects extracted from every source.
@@ -134,6 +172,9 @@ class CVFileToDocument:
134
172
  logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
135
173
  continue
136
174
 
175
+ if self.verify:
176
+ _verify_cv(bytestream.data, str(source))
177
+
137
178
  try:
138
179
  file = extract(bytestream.data)
139
180
  except Exception as e:
@@ -12,6 +12,7 @@ from haystack_integrations.components.converters.cvfile import CVFileToDocument
12
12
 
13
13
  FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv"
14
14
  UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv"
15
+ MALICIOUS_FIXTURE = Path(__file__).parents[3] / "spec" / "test-vectors" / "malicious" / "js-action.cv"
15
16
 
16
17
 
17
18
  @pytest.fixture(scope="module")
@@ -71,11 +72,14 @@ def test_accepts_bytestream() -> None:
71
72
 
72
73
 
73
74
  def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
74
- converter = CVFileToDocument()
75
+ """Parse failures are logged and skipped when verification is off; with
76
+ verify=True (default) the same garbage fails validation and raises."""
75
77
  not_a_cv = tmp_path / "garbage.cv"
76
78
  not_a_cv.write_bytes(b"not a real cv file")
77
- result = converter.run(sources=[not_a_cv])
79
+ result = CVFileToDocument(verify=False).run(sources=[not_a_cv])
78
80
  assert result["documents"] == []
81
+ with pytest.raises(ValueError, match="pdf-parse-failed"):
82
+ CVFileToDocument().run(sources=[not_a_cv])
79
83
 
80
84
 
81
85
  def test_chunks_mode_attaches_a_vector_per_chunk() -> None:
@@ -95,6 +99,28 @@ def test_invalid_mode_rejected() -> None:
95
99
  CVFileToDocument(mode="bogus")
96
100
 
97
101
 
102
+ def test_verify_rejects_malicious_file() -> None:
103
+ if not MALICIOUS_FIXTURE.exists():
104
+ pytest.skip(f"fixture not found: {MALICIOUS_FIXTURE}")
105
+ with pytest.raises(ValueError, match="javascript-action"):
106
+ CVFileToDocument().run(sources=[MALICIOUS_FIXTURE])
107
+
108
+
109
+ def test_verify_false_converts_malicious_file() -> None:
110
+ if not MALICIOUS_FIXTURE.exists():
111
+ pytest.skip(f"fixture not found: {MALICIOUS_FIXTURE}")
112
+ docs = CVFileToDocument(verify=False).run(sources=[MALICIOUS_FIXTURE])["documents"]
113
+ assert len(docs) >= 1
114
+
115
+
116
+ def test_verify_default_passes_on_valid_file() -> None:
117
+ if not FIXTURE.exists():
118
+ pytest.skip(f"fixture not found: {FIXTURE}")
119
+ converter = CVFileToDocument()
120
+ assert converter.verify is True
121
+ assert len(converter.run(sources=[FIXTURE])["documents"]) >= 1
122
+
123
+
98
124
  def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None:
99
125
  if not UNICODE_FIXTURE.exists():
100
126
  pytest.skip(f"fixture not found: {UNICODE_FIXTURE}")