cvfile-haystack 0.2.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cvfile_haystack-0.2.0 → cvfile_haystack-0.3.2}/.gitignore +13 -0
- {cvfile_haystack-0.2.0 → cvfile_haystack-0.3.2}/PKG-INFO +17 -2
- {cvfile_haystack-0.2.0 → cvfile_haystack-0.3.2}/README.md +14 -0
- {cvfile_haystack-0.2.0 → cvfile_haystack-0.3.2}/pyproject.toml +3 -2
- {cvfile_haystack-0.2.0 → cvfile_haystack-0.3.2}/src/haystack_integrations/components/converters/cvfile/converter.py +43 -2
- {cvfile_haystack-0.2.0 → cvfile_haystack-0.3.2}/tests/test_converter.py +28 -2
- {cvfile_haystack-0.2.0 → cvfile_haystack-0.3.2}/src/haystack_integrations/components/converters/cvfile/__init__.py +0 -0
- {cvfile_haystack-0.2.0 → cvfile_haystack-0.3.2}/tests/__init__.py +0 -0
|
@@ -65,3 +65,16 @@ src-tauri/target/
|
|
|
65
65
|
*.seed
|
|
66
66
|
*.tgz
|
|
67
67
|
.vercel
|
|
68
|
+
|
|
69
|
+
# Launch material (HN/PH drafts, runbook, screenshots) and private outreach
|
|
70
|
+
# data (PII): kept local, purged from history, never published.
|
|
71
|
+
launch/
|
|
72
|
+
|
|
73
|
+
# Internal planning documents: business strategy, launch gates, monetization.
|
|
74
|
+
# Kept local, never published.
|
|
75
|
+
PLAN.md
|
|
76
|
+
ROADMAP.md
|
|
77
|
+
|
|
78
|
+
# Generated demo/build artefacts that escape the dist/ pattern
|
|
79
|
+
dist-demo/
|
|
80
|
+
docs/public/embed/
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cvfile-haystack
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: Haystack integration for the .cv open file format.
|
|
5
5
|
Project-URL: Homepage, https://cvfile.org
|
|
6
6
|
Project-URL: Repository, https://github.com/cvfile/cv
|
|
@@ -15,8 +15,9 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
19
|
Requires-Python: >=3.10
|
|
19
|
-
Requires-Dist: cvfile<2,>=0.
|
|
20
|
+
Requires-Dist: cvfile<2,>=0.3.2
|
|
20
21
|
Requires-Dist: haystack-ai<3,>=2.8
|
|
21
22
|
Provides-Extra: dev
|
|
22
23
|
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
@@ -63,6 +64,20 @@ alternates and supplements:
|
|
|
63
64
|
converter = CVFileToDocument(primary_only=True)
|
|
64
65
|
```
|
|
65
66
|
|
|
67
|
+
### Untrusted files
|
|
68
|
+
|
|
69
|
+
By default the converter runs `cvfile.validate()` on every source before
|
|
70
|
+
extracting anything. Files carrying forbidden active content (JavaScript,
|
|
71
|
+
launch or submit actions, external references), encryption, integrity digest
|
|
72
|
+
mismatches, or payloads over the spec size cap make `run()` raise
|
|
73
|
+
`ValueError` listing the issue codes. Resumes are classic untrusted input,
|
|
74
|
+
so keep the default when converting files you did not produce yourself.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
converter = CVFileToDocument() # verify=True (default)
|
|
78
|
+
converter = CVFileToDocument(verify=False) # trusted files only
|
|
79
|
+
```
|
|
80
|
+
|
|
66
81
|
### Pipeline use
|
|
67
82
|
|
|
68
83
|
```python
|
|
@@ -38,6 +38,20 @@ alternates and supplements:
|
|
|
38
38
|
converter = CVFileToDocument(primary_only=True)
|
|
39
39
|
```
|
|
40
40
|
|
|
41
|
+
### Untrusted files
|
|
42
|
+
|
|
43
|
+
By default the converter runs `cvfile.validate()` on every source before
|
|
44
|
+
extracting anything. Files carrying forbidden active content (JavaScript,
|
|
45
|
+
launch or submit actions, external references), encryption, integrity digest
|
|
46
|
+
mismatches, or payloads over the spec size cap make `run()` raise
|
|
47
|
+
`ValueError` listing the issue codes. Resumes are classic untrusted input,
|
|
48
|
+
so keep the default when converting files you did not produce yourself.
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
converter = CVFileToDocument() # verify=True (default)
|
|
52
|
+
converter = CVFileToDocument(verify=False) # trusted files only
|
|
53
|
+
```
|
|
54
|
+
|
|
41
55
|
### Pipeline use
|
|
42
56
|
|
|
43
57
|
```python
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "cvfile-haystack"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.3.2"
|
|
8
8
|
description = "Haystack integration for the .cv open file format."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "Apache-2.0" }
|
|
@@ -19,9 +19,10 @@ classifiers = [
|
|
|
19
19
|
"Programming Language :: Python :: 3.11",
|
|
20
20
|
"Programming Language :: Python :: 3.12",
|
|
21
21
|
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Programming Language :: Python :: 3.14",
|
|
22
23
|
]
|
|
23
24
|
dependencies = [
|
|
24
|
-
"cvfile>=0.
|
|
25
|
+
"cvfile>=0.3.2,<2",
|
|
25
26
|
"haystack-ai>=2.8,<3",
|
|
26
27
|
]
|
|
27
28
|
|
|
@@ -5,11 +5,12 @@ from __future__ import annotations
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import Any
|
|
7
7
|
|
|
8
|
-
from cvfile import CvFile, ExtractedPayload, extract
|
|
9
8
|
from haystack import Document, component, logging
|
|
10
9
|
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
|
|
11
10
|
from haystack.dataclasses import ByteStream
|
|
12
11
|
|
|
12
|
+
from cvfile import CvFile, ExtractedPayload, extract, validate
|
|
13
|
+
|
|
13
14
|
logger = logging.getLogger(__name__)
|
|
14
15
|
|
|
15
16
|
_TEXT_MIME_PREFIXES: tuple[str, ...] = (
|
|
@@ -35,6 +36,24 @@ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
|
|
|
35
36
|
}
|
|
36
37
|
|
|
37
38
|
|
|
39
|
+
def _verify_cv(data: bytes, source: str) -> None:
|
|
40
|
+
"""Refuse to convert a .cv file that fails ``cvfile.validate()`` (lenient level).
|
|
41
|
+
|
|
42
|
+
Validation rejects forbidden active content (JavaScript, launch and submit
|
|
43
|
+
actions, external references), encryption, integrity digest mismatches, and
|
|
44
|
+
payloads over the spec size cap, which is the right default for untrusted
|
|
45
|
+
input.
|
|
46
|
+
"""
|
|
47
|
+
report = validate(data)
|
|
48
|
+
if report.ok:
|
|
49
|
+
return
|
|
50
|
+
codes = ", ".join(sorted({issue.code for issue in report.issues if issue.level == "error"}))
|
|
51
|
+
raise ValueError(
|
|
52
|
+
f".cv validation failed for {source}: {codes}. "
|
|
53
|
+
"The file was rejected before extraction; pass verify=False only for trusted files."
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
38
57
|
def _resolve_chunks(file: CvFile) -> list:
|
|
39
58
|
"""Decode the file's embeddings.cbor into text-resolved chunks.
|
|
40
59
|
|
|
@@ -83,9 +102,18 @@ class CVFileToDocument:
|
|
|
83
102
|
UTF-8 byte offsets. Files without an embeddings payload fall back to a single
|
|
84
103
|
Markdown ``Document``. In ``mode="chunks"`` the ``primary_only`` flag is
|
|
85
104
|
ignored (chunks already index a single text payload).
|
|
105
|
+
|
|
106
|
+
By default (``verify=True``) each source is checked with
|
|
107
|
+
``cvfile.validate()`` before extraction: files carrying forbidden active
|
|
108
|
+
content (JavaScript, launch or submit actions, external references),
|
|
109
|
+
encryption, integrity digest mismatches, or oversized payloads make
|
|
110
|
+
``run()`` raise ``ValueError`` listing the issue codes. Unlike unreadable
|
|
111
|
+
sources, which are logged and skipped, a validation failure is a security
|
|
112
|
+
signal and is never silently dropped. Set ``verify=False`` to skip the
|
|
113
|
+
check for trusted files only.
|
|
86
114
|
"""
|
|
87
115
|
|
|
88
|
-
def __init__(self, primary_only: bool = False, *, mode: str = "payloads") -> None:
|
|
116
|
+
def __init__(self, primary_only: bool = False, *, mode: str = "payloads", verify: bool = True) -> None:
|
|
89
117
|
"""Create a CVFileToDocument component.
|
|
90
118
|
|
|
91
119
|
:param primary_only:
|
|
@@ -97,11 +125,16 @@ class CVFileToDocument:
|
|
|
97
125
|
``"payloads"`` (default) emits one ``Document`` per textual payload.
|
|
98
126
|
``"chunks"`` emits one ``Document`` per pre-computed embedding chunk
|
|
99
127
|
with its vector attached.
|
|
128
|
+
:param verify:
|
|
129
|
+
If ``True`` (default), run ``cvfile.validate()`` on every source
|
|
130
|
+
and raise ``ValueError`` when a file fails validation. Set to
|
|
131
|
+
``False`` only for trusted files.
|
|
100
132
|
"""
|
|
101
133
|
if mode not in ("payloads", "chunks"):
|
|
102
134
|
raise ValueError("mode must be 'payloads' or 'chunks'")
|
|
103
135
|
self.primary_only = primary_only
|
|
104
136
|
self.mode = mode
|
|
137
|
+
self.verify = verify
|
|
105
138
|
|
|
106
139
|
@component.output_types(documents=list[Document])
|
|
107
140
|
def run(
|
|
@@ -120,6 +153,11 @@ class CVFileToDocument:
|
|
|
120
153
|
inputs (the same dictionary is merged into every document
|
|
121
154
|
produced from that source).
|
|
122
155
|
|
|
156
|
+
:raises ValueError:
|
|
157
|
+
When ``verify=True`` (the default) and a source fails
|
|
158
|
+
``cvfile.validate()``. The message lists the validation issue
|
|
159
|
+
codes.
|
|
160
|
+
|
|
123
161
|
:returns:
|
|
124
162
|
A dictionary with key ``documents`` containing the list of
|
|
125
163
|
``Document`` objects extracted from every source.
|
|
@@ -134,6 +172,9 @@ class CVFileToDocument:
|
|
|
134
172
|
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
|
|
135
173
|
continue
|
|
136
174
|
|
|
175
|
+
if self.verify:
|
|
176
|
+
_verify_cv(bytestream.data, str(source))
|
|
177
|
+
|
|
137
178
|
try:
|
|
138
179
|
file = extract(bytestream.data)
|
|
139
180
|
except Exception as e:
|
|
@@ -12,6 +12,7 @@ from haystack_integrations.components.converters.cvfile import CVFileToDocument
|
|
|
12
12
|
|
|
13
13
|
FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv"
|
|
14
14
|
UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv"
|
|
15
|
+
MALICIOUS_FIXTURE = Path(__file__).parents[3] / "spec" / "test-vectors" / "malicious" / "js-action.cv"
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
@pytest.fixture(scope="module")
|
|
@@ -71,11 +72,14 @@ def test_accepts_bytestream() -> None:
|
|
|
71
72
|
|
|
72
73
|
|
|
73
74
|
def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
|
|
74
|
-
|
|
75
|
+
"""Parse failures are logged and skipped when verification is off; with
|
|
76
|
+
verify=True (default) the same garbage fails validation and raises."""
|
|
75
77
|
not_a_cv = tmp_path / "garbage.cv"
|
|
76
78
|
not_a_cv.write_bytes(b"not a real cv file")
|
|
77
|
-
result =
|
|
79
|
+
result = CVFileToDocument(verify=False).run(sources=[not_a_cv])
|
|
78
80
|
assert result["documents"] == []
|
|
81
|
+
with pytest.raises(ValueError, match="pdf-parse-failed"):
|
|
82
|
+
CVFileToDocument().run(sources=[not_a_cv])
|
|
79
83
|
|
|
80
84
|
|
|
81
85
|
def test_chunks_mode_attaches_a_vector_per_chunk() -> None:
|
|
@@ -95,6 +99,28 @@ def test_invalid_mode_rejected() -> None:
|
|
|
95
99
|
CVFileToDocument(mode="bogus")
|
|
96
100
|
|
|
97
101
|
|
|
102
|
+
def test_verify_rejects_malicious_file() -> None:
|
|
103
|
+
if not MALICIOUS_FIXTURE.exists():
|
|
104
|
+
pytest.skip(f"fixture not found: {MALICIOUS_FIXTURE}")
|
|
105
|
+
with pytest.raises(ValueError, match="javascript-action"):
|
|
106
|
+
CVFileToDocument().run(sources=[MALICIOUS_FIXTURE])
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def test_verify_false_converts_malicious_file() -> None:
|
|
110
|
+
if not MALICIOUS_FIXTURE.exists():
|
|
111
|
+
pytest.skip(f"fixture not found: {MALICIOUS_FIXTURE}")
|
|
112
|
+
docs = CVFileToDocument(verify=False).run(sources=[MALICIOUS_FIXTURE])["documents"]
|
|
113
|
+
assert len(docs) >= 1
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_verify_default_passes_on_valid_file() -> None:
|
|
117
|
+
if not FIXTURE.exists():
|
|
118
|
+
pytest.skip(f"fixture not found: {FIXTURE}")
|
|
119
|
+
converter = CVFileToDocument()
|
|
120
|
+
assert converter.verify is True
|
|
121
|
+
assert len(converter.run(sources=[FIXTURE])["documents"]) >= 1
|
|
122
|
+
|
|
123
|
+
|
|
98
124
|
def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None:
|
|
99
125
|
if not UNICODE_FIXTURE.exists():
|
|
100
126
|
pytest.skip(f"fixture not found: {UNICODE_FIXTURE}")
|
|
File without changes
|
|
File without changes
|