cvfile-haystack 0.1.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.3.2}/.gitignore +13 -0
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.3.2}/PKG-INFO +17 -2
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.3.2}/README.md +14 -0
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.3.2}/pyproject.toml +3 -2
- cvfile_haystack-0.3.2/src/haystack_integrations/components/converters/cvfile/converter.py +229 -0
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.3.2}/tests/test_converter.py +59 -2
- cvfile_haystack-0.1.0/src/haystack_integrations/components/converters/cvfile/converter.py +0 -117
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.3.2}/src/haystack_integrations/components/converters/cvfile/__init__.py +0 -0
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.3.2}/tests/__init__.py +0 -0
|
@@ -65,3 +65,16 @@ src-tauri/target/
|
|
|
65
65
|
*.seed
|
|
66
66
|
*.tgz
|
|
67
67
|
.vercel
|
|
68
|
+
|
|
69
|
+
# Launch material (HN/PH drafts, runbook, screenshots) and private outreach
|
|
70
|
+
# data (PII): kept local, purged from history, never published.
|
|
71
|
+
launch/
|
|
72
|
+
|
|
73
|
+
# Internal planning documents: business strategy, launch gates, monetization.
|
|
74
|
+
# Kept local, never published.
|
|
75
|
+
PLAN.md
|
|
76
|
+
ROADMAP.md
|
|
77
|
+
|
|
78
|
+
# Generated demo/build artefacts that escape the dist/ pattern
|
|
79
|
+
dist-demo/
|
|
80
|
+
docs/public/embed/
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cvfile-haystack
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: Haystack integration for the .cv open file format.
|
|
5
5
|
Project-URL: Homepage, https://cvfile.org
|
|
6
6
|
Project-URL: Repository, https://github.com/cvfile/cv
|
|
@@ -15,8 +15,9 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
19
|
Requires-Python: >=3.10
|
|
19
|
-
Requires-Dist: cvfile<
|
|
20
|
+
Requires-Dist: cvfile<2,>=0.3.2
|
|
20
21
|
Requires-Dist: haystack-ai<3,>=2.8
|
|
21
22
|
Provides-Extra: dev
|
|
22
23
|
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
@@ -63,6 +64,20 @@ alternates and supplements:
|
|
|
63
64
|
converter = CVFileToDocument(primary_only=True)
|
|
64
65
|
```
|
|
65
66
|
|
|
67
|
+
### Untrusted files
|
|
68
|
+
|
|
69
|
+
By default the converter runs `cvfile.validate()` on every source before
|
|
70
|
+
extracting anything. Files carrying forbidden active content (JavaScript,
|
|
71
|
+
launch or submit actions, external references), encryption, integrity digest
|
|
72
|
+
mismatches, or payloads over the spec size cap make `run()` raise
|
|
73
|
+
`ValueError` listing the issue codes. Resumes are classic untrusted input,
|
|
74
|
+
so keep the default when converting files you did not produce yourself.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
converter = CVFileToDocument() # verify=True (default)
|
|
78
|
+
converter = CVFileToDocument(verify=False) # trusted files only
|
|
79
|
+
```
|
|
80
|
+
|
|
66
81
|
### Pipeline use
|
|
67
82
|
|
|
68
83
|
```python
|
|
@@ -38,6 +38,20 @@ alternates and supplements:
|
|
|
38
38
|
converter = CVFileToDocument(primary_only=True)
|
|
39
39
|
```
|
|
40
40
|
|
|
41
|
+
### Untrusted files
|
|
42
|
+
|
|
43
|
+
By default the converter runs `cvfile.validate()` on every source before
|
|
44
|
+
extracting anything. Files carrying forbidden active content (JavaScript,
|
|
45
|
+
launch or submit actions, external references), encryption, integrity digest
|
|
46
|
+
mismatches, or payloads over the spec size cap make `run()` raise
|
|
47
|
+
`ValueError` listing the issue codes. Resumes are classic untrusted input,
|
|
48
|
+
so keep the default when converting files you did not produce yourself.
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
converter = CVFileToDocument() # verify=True (default)
|
|
52
|
+
converter = CVFileToDocument(verify=False) # trusted files only
|
|
53
|
+
```
|
|
54
|
+
|
|
41
55
|
### Pipeline use
|
|
42
56
|
|
|
43
57
|
```python
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "cvfile-haystack"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.3.2"
|
|
8
8
|
description = "Haystack integration for the .cv open file format."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "Apache-2.0" }
|
|
@@ -19,9 +19,10 @@ classifiers = [
|
|
|
19
19
|
"Programming Language :: Python :: 3.11",
|
|
20
20
|
"Programming Language :: Python :: 3.12",
|
|
21
21
|
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Programming Language :: Python :: 3.14",
|
|
22
23
|
]
|
|
23
24
|
dependencies = [
|
|
24
|
-
"cvfile>=0.
|
|
25
|
+
"cvfile>=0.3.2,<2",
|
|
25
26
|
"haystack-ai>=2.8,<3",
|
|
26
27
|
]
|
|
27
28
|
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""Haystack ``@component`` converter for the .cv open file format."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from haystack import Document, component, logging
|
|
9
|
+
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
|
|
10
|
+
from haystack.dataclasses import ByteStream
|
|
11
|
+
|
|
12
|
+
from cvfile import CvFile, ExtractedPayload, extract, validate
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
_TEXT_MIME_PREFIXES: tuple[str, ...] = (
|
|
17
|
+
"text/",
|
|
18
|
+
"application/json",
|
|
19
|
+
"application/xml",
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _is_text_payload(payload: ExtractedPayload) -> bool:
|
|
24
|
+
return any(payload.mime_type.startswith(prefix) for prefix in _TEXT_MIME_PREFIXES)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
|
|
28
|
+
return {
|
|
29
|
+
"mime_type": payload.mime_type,
|
|
30
|
+
"payload": payload.name,
|
|
31
|
+
"relationship": payload.relationship,
|
|
32
|
+
"language": payload.language,
|
|
33
|
+
"primary": payload.name == file.metadata.primary_payload,
|
|
34
|
+
"cv_version": file.metadata.version,
|
|
35
|
+
"cv_generator": file.metadata.generator,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _verify_cv(data: bytes, source: str) -> None:
    """Refuse to convert a .cv file that fails ``cvfile.validate()`` (lenient level).

    Validation rejects forbidden active content (JavaScript, launch and submit
    actions, external references), encryption, integrity digest mismatches, and
    payloads over the spec size cap, which is the right default for untrusted
    input.

    :param data:
        Raw bytes of the ``.cv`` file.
    :param source:
        Label identifying the source; only used in the error message.
    :raises ValueError:
        When the validation report is not ``ok``. The message lists the
        de-duplicated, sorted codes of the error-level issues.
    """
    report = validate(data)
    if report.ok:
        return

    codes = sorted({issue.code for issue in report.issues if issue.level == "error"})
    if not codes:
        # The report failed without any issue tagged "error": fall back to every
        # reported code so the message never ends in an empty list.
        codes = sorted({issue.code for issue in report.issues})
    summary = ", ".join(codes) if codes else "no issue codes reported"

    raise ValueError(
        f".cv validation failed for {source}: {summary}. "
        "The file was rejected before extraction; pass verify=False only for trusted files."
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _resolve_chunks(file: CvFile) -> list:
|
|
58
|
+
"""Decode the file's embeddings.cbor into text-resolved chunks.
|
|
59
|
+
|
|
60
|
+
Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets
|
|
61
|
+
(spec §5.1) and stays the single source of truth. Returns an empty list
|
|
62
|
+
when the embed extra is not installed or the file carries no embeddings.
|
|
63
|
+
"""
|
|
64
|
+
try:
|
|
65
|
+
from cvfile.embed import resolve_embedding_chunks
|
|
66
|
+
except ImportError:
|
|
67
|
+
return []
|
|
68
|
+
return resolve_embedding_chunks(file)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _chunk_meta(chunk: Any, file: CvFile) -> dict[str, Any]:
|
|
72
|
+
return {
|
|
73
|
+
"language": file.metadata.primary_language,
|
|
74
|
+
"cv_version": file.metadata.version,
|
|
75
|
+
"cv_generator": file.metadata.generator,
|
|
76
|
+
"chunk_id": chunk.id,
|
|
77
|
+
"chunk_offset": chunk.text_offset,
|
|
78
|
+
"chunk_length": chunk.text_length,
|
|
79
|
+
"embedding_model": chunk.model,
|
|
80
|
+
"embedding_dimension": chunk.dimension,
|
|
81
|
+
"embedding_metric": chunk.metric,
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@component
class CVFileToDocument:
    """Convert ``.cv`` files into Haystack ``Document`` objects.

    A ``.cv`` file is a PDF/A-3u that carries one or more textual payloads
    (Markdown, HTML, JSON) as PDF Associated Files. This converter reads
    each ``.cv`` source and emits one ``Document`` per textual payload. The
    visual PDF layer is intentionally skipped because the embedded Markdown
    is a cleaner text representation of the same content.

    Set ``primary_only=True`` to emit only the payload marked as
    ``primaryPayload`` in the file's XMP metadata (usually the canonical
    Markdown copy), and skip all alternates.

    Set ``mode="chunks"`` to emit one ``Document`` per pre-computed embedding
    chunk instead of one per payload. Each chunk ``Document`` carries its vector
    on ``Document.embedding`` and its text is sliced from the markdown using
    UTF-8 byte offsets. Files without an embeddings payload fall back to a single
    Markdown ``Document``. In ``mode="chunks"`` the ``primary_only`` flag is
    ignored (chunks already index a single text payload).

    By default (``verify=True``) each source is checked with
    ``cvfile.validate()`` before extraction: files carrying forbidden active
    content (JavaScript, launch or submit actions, external references),
    encryption, integrity digest mismatches, or oversized payloads make
    ``run()`` raise ``ValueError`` listing the issue codes. Unlike unreadable
    sources, which are logged and skipped, a validation failure is a security
    signal and is never silently dropped. Set ``verify=False`` to skip the
    check for trusted files only.
    """

    def __init__(self, primary_only: bool = False, *, mode: str = "payloads", verify: bool = True) -> None:
        """Create a CVFileToDocument component.

        :param primary_only:
            If ``True``, emit only the payload marked as ``primaryPayload``
            in the file's XMP metadata. If ``False`` (default), emit one
            ``Document`` per textual payload (the primary plus any
            language alternates and supplements). Ignored in ``mode="chunks"``.
        :param mode:
            ``"payloads"`` (default) emits one ``Document`` per textual payload.
            ``"chunks"`` emits one ``Document`` per pre-computed embedding chunk
            with its vector attached.
        :param verify:
            If ``True`` (default), run ``cvfile.validate()`` on every source
            and raise ``ValueError`` when a file fails validation. Set to
            ``False`` only for trusted files.

        :raises ValueError:
            When ``mode`` is neither ``"payloads"`` nor ``"chunks"``.
        """
        if mode not in ("payloads", "chunks"):
            raise ValueError("mode must be 'payloads' or 'chunks'")
        self.primary_only = primary_only
        self.mode = mode
        self.verify = verify

    @component.output_types(documents=list[Document])
    def run(
        self,
        sources: list[str | Path | ByteStream],
        meta: dict[str, Any] | list[dict[str, Any]] | None = None,
    ) -> dict[str, Any]:
        """Convert a list of ``.cv`` sources into ``Document`` objects.

        :param sources:
            File paths or ``ByteStream`` objects pointing at ``.cv`` files.
        :param meta:
            Optional metadata to attach to the produced documents. A single
            dictionary is merged into every document. A list must have the
            same length as ``sources`` and is zipped one to one with the
            inputs (the same dictionary is merged into every document
            produced from that source).

        :raises ValueError:
            When ``verify=True`` (the default) and a source fails
            ``cvfile.validate()``. The message lists the validation issue
            codes and names the source by the same label that is stored in
            the documents' ``source`` metadata.

        :returns:
            A dictionary with key ``documents`` containing the list of
            ``Document`` objects extracted from every source.
        """
        documents: list[Document] = []
        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, source_meta in zip(sources, meta_list, strict=True):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            # Resolve the label before verification so a validation error names
            # the file path / file name instead of the repr of a ByteStream.
            stream_meta = bytestream.meta or {}
            source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)

            if self.verify:
                # Deliberately not wrapped in try/except: a validation failure
                # must propagate to the caller rather than being skipped.
                _verify_cv(bytestream.data, str(source_label))

            try:
                file = extract(bytestream.data)
            except Exception as e:
                logger.warning(
                    "Could not parse .cv file from {source}. Skipping it. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            if self.mode == "chunks":
                documents.extend(self._chunk_documents(file, stream_meta, source_meta, source_label))
                continue

            for payload in file.payloads:
                if not _is_text_payload(payload):
                    continue
                payload_meta = _payload_meta(payload, file)
                if self.primary_only and not payload_meta["primary"]:
                    continue
                merged = self._merge_meta(stream_meta, payload_meta, source_meta, source_label)
                documents.append(Document(content=payload.text(), meta=merged))

        return {"documents": documents}

    @staticmethod
    def _merge_meta(
        stream_meta: dict[str, Any],
        derived_meta: dict[str, Any],
        source_meta: dict[str, Any],
        source_label: str,
    ) -> dict[str, Any]:
        """Merge the metadata layers of one document; later layers win on key collisions.

        Order of precedence, lowest to highest: the ByteStream's metadata, the
        metadata derived from the payload or chunk, the caller-supplied
        metadata, and finally the ``source`` label.
        """
        return {**stream_meta, **derived_meta, **source_meta, "source": source_label}

    @staticmethod
    def _chunk_documents(
        file: CvFile,
        stream_meta: dict[str, Any],
        source_meta: dict[str, Any],
        source_label: str,
    ) -> list[Document]:
        """Build the documents for one file in ``mode="chunks"``.

        :param file:
            The parsed ``.cv`` file.
        :param stream_meta:
            Metadata carried by the source ``ByteStream``.
        :param source_meta:
            Caller-supplied metadata for this source.
        :param source_label:
            Value stored under the ``source`` metadata key.
        :returns:
            One ``Document`` per resolved chunk, each with its vector on
            ``embedding``. When no chunks are resolved, a single ``Document``
            for the primary textual payload, or an empty list if the file has
            no primary textual payload.
        """
        chunks = _resolve_chunks(file)
        if not chunks:
            # No embeddings available: fall back to the primary textual payload.
            primary = next(
                (p for p in file.payloads if p.name == file.metadata.primary_payload and _is_text_payload(p)),
                None,
            )
            if primary is None:
                return []
            merged = CVFileToDocument._merge_meta(stream_meta, _payload_meta(primary, file), source_meta, source_label)
            return [Document(content=primary.text(), meta=merged)]

        return [
            Document(
                content=chunk.text,
                meta=CVFileToDocument._merge_meta(stream_meta, _chunk_meta(chunk, file), source_meta, source_label),
                embedding=list(chunk.vector),
            )
            for chunk in chunks
        ]
|
|
@@ -11,6 +11,8 @@ from haystack.dataclasses import ByteStream
|
|
|
11
11
|
from haystack_integrations.components.converters.cvfile import CVFileToDocument
|
|
12
12
|
|
|
13
13
|
FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv"
|
|
14
|
+
UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv"
|
|
15
|
+
MALICIOUS_FIXTURE = Path(__file__).parents[3] / "spec" / "test-vectors" / "malicious" / "js-action.cv"
|
|
14
16
|
|
|
15
17
|
|
|
16
18
|
@pytest.fixture(scope="module")
|
|
@@ -70,8 +72,63 @@ def test_accepts_bytestream() -> None:
|
|
|
70
72
|
|
|
71
73
|
|
|
72
74
|
def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
|
|
73
|
-
|
|
75
|
+
"""Parse failures are logged and skipped when verification is off; with
|
|
76
|
+
verify=True (default) the same garbage fails validation and raises."""
|
|
74
77
|
not_a_cv = tmp_path / "garbage.cv"
|
|
75
78
|
not_a_cv.write_bytes(b"not a real cv file")
|
|
76
|
-
result =
|
|
79
|
+
result = CVFileToDocument(verify=False).run(sources=[not_a_cv])
|
|
77
80
|
assert result["documents"] == []
|
|
81
|
+
with pytest.raises(ValueError, match="pdf-parse-failed"):
|
|
82
|
+
CVFileToDocument().run(sources=[not_a_cv])
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def test_chunks_mode_attaches_a_vector_per_chunk() -> None:
    """In ``mode="chunks"`` every emitted document carries text and a float vector."""
    # Without the optional embed module the converter falls back to a plain
    # payload document with no embedding, so the assertions below would fail
    # for an environment reason rather than a real regression.
    pytest.importorskip("cvfile.embed")
    if not FIXTURE.exists():
        pytest.skip(f"fixture not found: {FIXTURE}")
    docs = CVFileToDocument(mode="chunks").run(sources=[FIXTURE])["documents"]
    assert len(docs) >= 1
    for doc in docs:
        assert doc.embedding is not None
        assert len(doc.embedding) == doc.meta["embedding_dimension"]
        assert all(isinstance(v, float) for v in doc.embedding)
        assert doc.content.strip(), "chunk text should not be empty"
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_invalid_mode_rejected() -> None:
    """An unknown ``mode`` is rejected at construction time with the mode error."""
    # Pin the message so an unrelated ValueError cannot satisfy the test.
    with pytest.raises(ValueError, match="mode must be"):
        CVFileToDocument(mode="bogus")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def test_verify_rejects_malicious_file() -> None:
    """With the default ``verify=True`` the js-action test vector must be refused."""
    if not MALICIOUS_FIXTURE.exists():
        pytest.skip(f"fixture not found: {MALICIOUS_FIXTURE}")
    converter = CVFileToDocument()
    with pytest.raises(ValueError) as excinfo:
        converter.run(sources=[MALICIOUS_FIXTURE])
    # The error message has to name the offending validation code.
    assert "javascript-action" in str(excinfo.value)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def test_verify_false_converts_malicious_file() -> None:
    """With ``verify=False`` the js-action test vector is converted instead of rejected."""
    if not MALICIOUS_FIXTURE.exists():
        pytest.skip(f"fixture not found: {MALICIOUS_FIXTURE}")
    unchecked = CVFileToDocument(verify=False)
    result = unchecked.run(sources=[MALICIOUS_FIXTURE])
    assert len(result["documents"]) >= 1
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_verify_default_passes_on_valid_file() -> None:
    """Verification is on by default and lets the valid fixture through."""
    if not FIXTURE.exists():
        pytest.skip(f"fixture not found: {FIXTURE}")
    converter = CVFileToDocument()
    assert converter.verify is True
    documents = converter.run(sources=[FIXTURE])["documents"]
    assert len(documents) >= 1
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None:
    """Chunk text containing multi-byte characters survives slicing intact."""
    # Without the optional embed module the converter falls back to one
    # document holding the whole payload; the checks below would then pass
    # without exercising chunk slicing at all.
    pytest.importorskip("cvfile.embed")
    if not UNICODE_FIXTURE.exists():
        pytest.skip(f"fixture not found: {UNICODE_FIXTURE}")
    docs = CVFileToDocument(mode="chunks").run(sources=[UNICODE_FIXTURE])["documents"]
    joined = "".join(d.content for d in docs)
    assert "Élodie" in joined
    assert "工程師" in joined
    assert "🚀" in joined
    assert "经验" in joined
    for doc in docs:
        # Round-tripping through UTF-8 raises if the text holds lone surrogates.
        assert doc.content == doc.content.encode("utf-8").decode("utf-8")
|
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
"""Haystack ``@component`` converter for the .cv open file format."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Any
|
|
7
|
-
|
|
8
|
-
from cvfile import CvFile, ExtractedPayload, extract
|
|
9
|
-
from haystack import Document, component, logging
|
|
10
|
-
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
|
|
11
|
-
from haystack.dataclasses import ByteStream
|
|
12
|
-
|
|
13
|
-
logger = logging.getLogger(__name__)
|
|
14
|
-
|
|
15
|
-
_TEXT_MIME_PREFIXES: tuple[str, ...] = (
|
|
16
|
-
"text/",
|
|
17
|
-
"application/json",
|
|
18
|
-
"application/xml",
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def _is_text_payload(payload: ExtractedPayload) -> bool:
|
|
23
|
-
return any(payload.mime_type.startswith(prefix) for prefix in _TEXT_MIME_PREFIXES)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
|
|
27
|
-
return {
|
|
28
|
-
"mime_type": payload.mime_type,
|
|
29
|
-
"payload": payload.name,
|
|
30
|
-
"relationship": payload.relationship,
|
|
31
|
-
"language": payload.language or file.metadata.primary_language,
|
|
32
|
-
"primary": payload.name == file.metadata.primary_payload,
|
|
33
|
-
"cv_version": file.metadata.version,
|
|
34
|
-
"cv_generator": file.metadata.generator,
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
@component
|
|
39
|
-
class CVFileToDocument:
|
|
40
|
-
"""Convert ``.cv`` files into Haystack ``Document`` objects.
|
|
41
|
-
|
|
42
|
-
A ``.cv`` file is a PDF/A-3u that carries one or more textual payloads
|
|
43
|
-
(Markdown, HTML, JSON) as PDF Associated Files. This converter reads
|
|
44
|
-
each ``.cv`` source and emits one ``Document`` per textual payload. The
|
|
45
|
-
visual PDF layer is intentionally skipped because the embedded Markdown
|
|
46
|
-
is a cleaner text representation of the same content.
|
|
47
|
-
|
|
48
|
-
Set ``primary_only=True`` to emit only the payload marked as
|
|
49
|
-
``primaryPayload`` in the file's XMP metadata (usually the canonical
|
|
50
|
-
Markdown copy), and skip all alternates.
|
|
51
|
-
"""
|
|
52
|
-
|
|
53
|
-
def __init__(self, primary_only: bool = False) -> None:
|
|
54
|
-
"""Create a CVFileToDocument component.
|
|
55
|
-
|
|
56
|
-
:param primary_only:
|
|
57
|
-
If ``True``, emit only the payload marked as ``primaryPayload``
|
|
58
|
-
in the file's XMP metadata. If ``False`` (default), emit one
|
|
59
|
-
``Document`` per textual payload (the primary plus any
|
|
60
|
-
language alternates and supplements).
|
|
61
|
-
"""
|
|
62
|
-
self.primary_only = primary_only
|
|
63
|
-
|
|
64
|
-
@component.output_types(documents=list[Document])
|
|
65
|
-
def run(
|
|
66
|
-
self,
|
|
67
|
-
sources: list[str | Path | ByteStream],
|
|
68
|
-
meta: dict[str, Any] | list[dict[str, Any]] | None = None,
|
|
69
|
-
) -> dict[str, Any]:
|
|
70
|
-
"""Convert a list of ``.cv`` sources into ``Document`` objects.
|
|
71
|
-
|
|
72
|
-
:param sources:
|
|
73
|
-
File paths or ``ByteStream`` objects pointing at ``.cv`` files.
|
|
74
|
-
:param meta:
|
|
75
|
-
Optional metadata to attach to the produced documents. A single
|
|
76
|
-
dictionary is merged into every document. A list must have the
|
|
77
|
-
same length as ``sources`` and is zipped one to one with the
|
|
78
|
-
inputs (the same dictionary is merged into every document
|
|
79
|
-
produced from that source).
|
|
80
|
-
|
|
81
|
-
:returns:
|
|
82
|
-
A dictionary with key ``documents`` containing the list of
|
|
83
|
-
``Document`` objects extracted from every source.
|
|
84
|
-
"""
|
|
85
|
-
documents: list[Document] = []
|
|
86
|
-
meta_list = normalize_metadata(meta, sources_count=len(sources))
|
|
87
|
-
|
|
88
|
-
for source, source_meta in zip(sources, meta_list, strict=True):
|
|
89
|
-
try:
|
|
90
|
-
bytestream = get_bytestream_from_source(source)
|
|
91
|
-
except Exception as e:
|
|
92
|
-
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
|
|
93
|
-
continue
|
|
94
|
-
|
|
95
|
-
try:
|
|
96
|
-
file = extract(bytestream.data)
|
|
97
|
-
except Exception as e:
|
|
98
|
-
logger.warning(
|
|
99
|
-
"Could not parse .cv file from {source}. Skipping it. Error: {error}",
|
|
100
|
-
source=source,
|
|
101
|
-
error=e,
|
|
102
|
-
)
|
|
103
|
-
continue
|
|
104
|
-
|
|
105
|
-
stream_meta = bytestream.meta or {}
|
|
106
|
-
source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)
|
|
107
|
-
|
|
108
|
-
for payload in file.payloads:
|
|
109
|
-
if not _is_text_payload(payload):
|
|
110
|
-
continue
|
|
111
|
-
payload_meta = _payload_meta(payload, file)
|
|
112
|
-
if self.primary_only and not payload_meta["primary"]:
|
|
113
|
-
continue
|
|
114
|
-
merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label}
|
|
115
|
-
documents.append(Document(content=payload.text(), meta=merged))
|
|
116
|
-
|
|
117
|
-
return {"documents": documents}
|
|
File without changes
|
|
File without changes
|