cvfile-haystack 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/PKG-INFO +2 -2
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/pyproject.toml +2 -2
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/src/haystack_integrations/components/converters/cvfile/converter.py +74 -3
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/tests/test_converter.py +31 -0
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/.gitignore +0 -0
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/README.md +0 -0
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/src/haystack_integrations/components/converters/cvfile/__init__.py +0 -0
- {cvfile_haystack-0.1.0 → cvfile_haystack-0.2.0}/tests/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cvfile-haystack
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Haystack integration for the .cv open file format.
|
|
5
5
|
Project-URL: Homepage, https://cvfile.org
|
|
6
6
|
Project-URL: Repository, https://github.com/cvfile/cv
|
|
@@ -16,7 +16,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
18
|
Requires-Python: >=3.10
|
|
19
|
-
Requires-Dist: cvfile<
|
|
19
|
+
Requires-Dist: cvfile<2,>=0.1
|
|
20
20
|
Requires-Dist: haystack-ai<3,>=2.8
|
|
21
21
|
Provides-Extra: dev
|
|
22
22
|
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "cvfile-haystack"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "Haystack integration for the .cv open file format."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "Apache-2.0" }
|
|
@@ -21,7 +21,7 @@ classifiers = [
|
|
|
21
21
|
"Programming Language :: Python :: 3.13",
|
|
22
22
|
]
|
|
23
23
|
dependencies = [
|
|
24
|
-
"cvfile>=0.1
|
|
24
|
+
"cvfile>=0.1,<2",
|
|
25
25
|
"haystack-ai>=2.8,<3",
|
|
26
26
|
]
|
|
27
27
|
|
|
@@ -28,13 +28,41 @@ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
|
|
|
28
28
|
"mime_type": payload.mime_type,
|
|
29
29
|
"payload": payload.name,
|
|
30
30
|
"relationship": payload.relationship,
|
|
31
|
-
"language": payload.language
|
|
31
|
+
"language": payload.language,
|
|
32
32
|
"primary": payload.name == file.metadata.primary_payload,
|
|
33
33
|
"cv_version": file.metadata.version,
|
|
34
34
|
"cv_generator": file.metadata.generator,
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
|
|
38
|
+
def _resolve_chunks(file: CvFile) -> list:
|
|
39
|
+
"""Decode the file's embeddings.cbor into text-resolved chunks.
|
|
40
|
+
|
|
41
|
+
Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets
|
|
42
|
+
(spec §5.1) and stays the single source of truth. Returns an empty list
|
|
43
|
+
when the embed extra is not installed or the file carries no embeddings.
|
|
44
|
+
"""
|
|
45
|
+
try:
|
|
46
|
+
from cvfile.embed import resolve_embedding_chunks
|
|
47
|
+
except ImportError:
|
|
48
|
+
return []
|
|
49
|
+
return resolve_embedding_chunks(file)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _chunk_meta(chunk: Any, file: CvFile) -> dict[str, Any]:
|
|
53
|
+
return {
|
|
54
|
+
"language": file.metadata.primary_language,
|
|
55
|
+
"cv_version": file.metadata.version,
|
|
56
|
+
"cv_generator": file.metadata.generator,
|
|
57
|
+
"chunk_id": chunk.id,
|
|
58
|
+
"chunk_offset": chunk.text_offset,
|
|
59
|
+
"chunk_length": chunk.text_length,
|
|
60
|
+
"embedding_model": chunk.model,
|
|
61
|
+
"embedding_dimension": chunk.dimension,
|
|
62
|
+
"embedding_metric": chunk.metric,
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
|
|
38
66
|
@component
|
|
39
67
|
class CVFileToDocument:
|
|
40
68
|
"""Convert ``.cv`` files into Haystack ``Document`` objects.
|
|
@@ -48,18 +76,32 @@ class CVFileToDocument:
|
|
|
48
76
|
Set ``primary_only=True`` to emit only the payload marked as
|
|
49
77
|
``primaryPayload`` in the file's XMP metadata (usually the canonical
|
|
50
78
|
Markdown copy), and skip all alternates.
|
|
79
|
+
|
|
80
|
+
Set ``mode="chunks"`` to emit one ``Document`` per pre-computed embedding
|
|
81
|
+
chunk instead of one per payload. Each chunk ``Document`` carries its vector
|
|
82
|
+
on ``Document.embedding`` and its text is sliced from the markdown using
|
|
83
|
+
UTF-8 byte offsets. Files without an embeddings payload fall back to a single
|
|
84
|
+
Markdown ``Document``. In ``mode="chunks"`` the ``primary_only`` flag is
|
|
85
|
+
ignored (chunks already index a single text payload).
|
|
51
86
|
"""
|
|
52
87
|
|
|
53
|
-
def __init__(self, primary_only: bool = False) -> None:
|
|
88
|
+
def __init__(self, primary_only: bool = False, *, mode: str = "payloads") -> None:
|
|
54
89
|
"""Create a CVFileToDocument component.
|
|
55
90
|
|
|
56
91
|
:param primary_only:
|
|
57
92
|
If ``True``, emit only the payload marked as ``primaryPayload``
|
|
58
93
|
in the file's XMP metadata. If ``False`` (default), emit one
|
|
59
94
|
``Document`` per textual payload (the primary plus any
|
|
60
|
-
language alternates and supplements).
|
|
95
|
+
language alternates and supplements). Ignored in ``mode="chunks"``.
|
|
96
|
+
:param mode:
|
|
97
|
+
``"payloads"`` (default) emits one ``Document`` per textual payload.
|
|
98
|
+
``"chunks"`` emits one ``Document`` per pre-computed embedding chunk
|
|
99
|
+
with its vector attached.
|
|
61
100
|
"""
|
|
101
|
+
if mode not in ("payloads", "chunks"):
|
|
102
|
+
raise ValueError("mode must be 'payloads' or 'chunks'")
|
|
62
103
|
self.primary_only = primary_only
|
|
104
|
+
self.mode = mode
|
|
63
105
|
|
|
64
106
|
@component.output_types(documents=list[Document])
|
|
65
107
|
def run(
|
|
@@ -105,6 +147,10 @@ class CVFileToDocument:
|
|
|
105
147
|
stream_meta = bytestream.meta or {}
|
|
106
148
|
source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)
|
|
107
149
|
|
|
150
|
+
if self.mode == "chunks":
|
|
151
|
+
documents.extend(self._chunk_documents(file, stream_meta, source_meta, source_label))
|
|
152
|
+
continue
|
|
153
|
+
|
|
108
154
|
for payload in file.payloads:
|
|
109
155
|
if not _is_text_payload(payload):
|
|
110
156
|
continue
|
|
@@ -115,3 +161,28 @@ class CVFileToDocument:
|
|
|
115
161
|
documents.append(Document(content=payload.text(), meta=merged))
|
|
116
162
|
|
|
117
163
|
return {"documents": documents}
|
|
164
|
+
|
|
165
|
+
@staticmethod
|
|
166
|
+
def _chunk_documents(
|
|
167
|
+
file: CvFile,
|
|
168
|
+
stream_meta: dict[str, Any],
|
|
169
|
+
source_meta: dict[str, Any],
|
|
170
|
+
source_label: str,
|
|
171
|
+
) -> list[Document]:
|
|
172
|
+
chunks = _resolve_chunks(file)
|
|
173
|
+
if not chunks:
|
|
174
|
+
primary = next(
|
|
175
|
+
(p for p in file.payloads if p.name == file.metadata.primary_payload and _is_text_payload(p)),
|
|
176
|
+
None,
|
|
177
|
+
)
|
|
178
|
+
if primary is None:
|
|
179
|
+
return []
|
|
180
|
+
payload_meta = _payload_meta(primary, file)
|
|
181
|
+
merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label}
|
|
182
|
+
return [Document(content=primary.text(), meta=merged)]
|
|
183
|
+
|
|
184
|
+
out: list[Document] = []
|
|
185
|
+
for chunk in chunks:
|
|
186
|
+
merged = {**stream_meta, **_chunk_meta(chunk, file), **source_meta, "source": source_label}
|
|
187
|
+
out.append(Document(content=chunk.text, meta=merged, embedding=list(chunk.vector)))
|
|
188
|
+
return out
|
|
@@ -11,6 +11,7 @@ from haystack.dataclasses import ByteStream
|
|
|
11
11
|
from haystack_integrations.components.converters.cvfile import CVFileToDocument
|
|
12
12
|
|
|
13
13
|
FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv"
|
|
14
|
+
UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv"
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
@pytest.fixture(scope="module")
|
|
@@ -75,3 +76,33 @@ def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
|
|
|
75
76
|
not_a_cv.write_bytes(b"not a real cv file")
|
|
76
77
|
result = converter.run(sources=[not_a_cv])
|
|
77
78
|
assert result["documents"] == []
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_chunks_mode_attaches_a_vector_per_chunk() -> None:
|
|
82
|
+
if not FIXTURE.exists():
|
|
83
|
+
pytest.skip(f"fixture not found: {FIXTURE}")
|
|
84
|
+
docs = CVFileToDocument(mode="chunks").run(sources=[FIXTURE])["documents"]
|
|
85
|
+
assert len(docs) >= 1
|
|
86
|
+
for doc in docs:
|
|
87
|
+
assert doc.embedding is not None
|
|
88
|
+
assert len(doc.embedding) == doc.meta["embedding_dimension"]
|
|
89
|
+
assert all(isinstance(v, float) for v in doc.embedding)
|
|
90
|
+
assert doc.content.strip(), "chunk text should not be empty"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def test_invalid_mode_rejected() -> None:
|
|
94
|
+
with pytest.raises(ValueError):
|
|
95
|
+
CVFileToDocument(mode="bogus")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None:
|
|
99
|
+
if not UNICODE_FIXTURE.exists():
|
|
100
|
+
pytest.skip(f"fixture not found: {UNICODE_FIXTURE}")
|
|
101
|
+
docs = CVFileToDocument(mode="chunks").run(sources=[UNICODE_FIXTURE])["documents"]
|
|
102
|
+
joined = "".join(d.content for d in docs)
|
|
103
|
+
assert "Élodie" in joined
|
|
104
|
+
assert "工程師" in joined
|
|
105
|
+
assert "🚀" in joined
|
|
106
|
+
assert "经验" in joined
|
|
107
|
+
for doc in docs:
|
|
108
|
+
assert doc.content == doc.content.encode("utf-8").decode("utf-8")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|