cvfile-haystack 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cvfile-haystack
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Haystack integration for the .cv open file format.
5
5
  Project-URL: Homepage, https://cvfile.org
6
6
  Project-URL: Repository, https://github.com/cvfile/cv
@@ -16,7 +16,7 @@ Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Programming Language :: Python :: 3.13
18
18
  Requires-Python: >=3.10
19
- Requires-Dist: cvfile<1,>=0.1.0
19
+ Requires-Dist: cvfile<2,>=0.1
20
20
  Requires-Dist: haystack-ai<3,>=2.8
21
21
  Provides-Extra: dev
22
22
  Requires-Dist: pytest>=8.0; extra == 'dev'
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "cvfile-haystack"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "Haystack integration for the .cv open file format."
9
9
  readme = "README.md"
10
10
  license = { text = "Apache-2.0" }
@@ -21,7 +21,7 @@ classifiers = [
21
21
  "Programming Language :: Python :: 3.13",
22
22
  ]
23
23
  dependencies = [
24
- "cvfile>=0.1.0,<1",
24
+ "cvfile>=0.1,<2",
25
25
  "haystack-ai>=2.8,<3",
26
26
  ]
27
27
 
@@ -28,13 +28,41 @@ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
28
28
  "mime_type": payload.mime_type,
29
29
  "payload": payload.name,
30
30
  "relationship": payload.relationship,
31
- "language": payload.language or file.metadata.primary_language,
31
+ "language": payload.language,
32
32
  "primary": payload.name == file.metadata.primary_payload,
33
33
  "cv_version": file.metadata.version,
34
34
  "cv_generator": file.metadata.generator,
35
35
  }
36
36
 
37
37
 
38
def _resolve_chunks(file: CvFile) -> list:
    """Return the text-resolved embedding chunks stored in ``file``.

    The work is delegated to ``cvfile.embed.resolve_embedding_chunks`` so that
    chunk text slicing (UTF-8 byte offsets, spec §5.1) has a single source of
    truth in the core SDK.

    :param file:
        The parsed ``.cv`` file to read embeddings from.
    :returns:
        Whatever ``resolve_embedding_chunks`` returns for ``file``, or an
        empty list when ``cvfile.embed`` cannot be imported (the embed extra
        is not installed).
    """
    try:
        # Imported lazily: the embed extra is optional for this integration.
        from cvfile.embed import resolve_embedding_chunks
    except ImportError:
        return []
    else:
        return resolve_embedding_chunks(file)
50
+
51
+
52
def _chunk_meta(chunk: Any, file: CvFile) -> dict[str, Any]:
    """Build the ``Document.meta`` entries describing one embedding chunk.

    :param chunk:
        A resolved chunk exposing ``id``, ``text_offset``, ``text_length``,
        ``model``, ``dimension`` and ``metric`` attributes.
    :param file:
        The ``.cv`` file the chunk came from; supplies the file-level
        language, version and generator.
    :returns:
        A dict with file-level provenance keys followed by chunk-level keys.
    """
    file_info = file.metadata
    return dict(
        # File-level provenance, shared by every chunk of the same file.
        language=file_info.primary_language,
        cv_version=file_info.version,
        cv_generator=file_info.generator,
        # Position of the chunk inside the indexed text.
        chunk_id=chunk.id,
        chunk_offset=chunk.text_offset,
        chunk_length=chunk.text_length,
        # Description of the pre-computed vector.
        embedding_model=chunk.model,
        embedding_dimension=chunk.dimension,
        embedding_metric=chunk.metric,
    )
64
+
65
+
38
66
  @component
39
67
  class CVFileToDocument:
40
68
  """Convert ``.cv`` files into Haystack ``Document`` objects.
@@ -48,18 +76,32 @@ class CVFileToDocument:
48
76
  Set ``primary_only=True`` to emit only the payload marked as
49
77
  ``primaryPayload`` in the file's XMP metadata (usually the canonical
50
78
  Markdown copy), and skip all alternates.
79
+
80
+ Set ``mode="chunks"`` to emit one ``Document`` per pre-computed embedding
81
+ chunk instead of one per payload. Each chunk ``Document`` carries its vector
82
+ on ``Document.embedding`` and its text is sliced from the markdown using
83
+ UTF-8 byte offsets. Files with no resolvable chunks (no embeddings payload, or the
84
+ embed extra is not installed) fall back to a single ``Document`` for the primary text
85
+ payload. In ``mode="chunks"`` the ``primary_only`` flag is ignored (chunks already index a single text payload).
51
86
  """
52
87
 
53
def __init__(self, primary_only: bool = False, *, mode: str = "payloads") -> None:
    """Create a CVFileToDocument component.

    :param primary_only:
        If ``True``, emit only the payload marked as ``primaryPayload``
        in the file's XMP metadata. If ``False`` (default), emit one
        ``Document`` per textual payload (the primary plus any
        language alternates and supplements). Ignored in ``mode="chunks"``.
    :param mode:
        ``"payloads"`` (default) emits one ``Document`` per textual payload.
        ``"chunks"`` emits one ``Document`` per pre-computed embedding chunk
        with its vector attached.
    :raises ValueError:
        If ``mode`` is neither ``"payloads"`` nor ``"chunks"``.
    """
    if mode not in ("payloads", "chunks"):
        # Report the rejected value so a misconfigured pipeline is easy to debug.
        raise ValueError(f"mode must be 'payloads' or 'chunks', got {mode!r}")
    self.primary_only = primary_only
    self.mode = mode
63
105
 
64
106
  @component.output_types(documents=list[Document])
65
107
  def run(
@@ -105,6 +147,10 @@ class CVFileToDocument:
105
147
  stream_meta = bytestream.meta or {}
106
148
  source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)
107
149
 
150
+ if self.mode == "chunks":
151
+ documents.extend(self._chunk_documents(file, stream_meta, source_meta, source_label))
152
+ continue
153
+
108
154
  for payload in file.payloads:
109
155
  if not _is_text_payload(payload):
110
156
  continue
@@ -115,3 +161,28 @@ class CVFileToDocument:
115
161
  documents.append(Document(content=payload.text(), meta=merged))
116
162
 
117
163
  return {"documents": documents}
164
+
165
@staticmethod
def _chunk_documents(
    file: CvFile,
    stream_meta: dict[str, Any],
    source_meta: dict[str, Any],
    source_label: str,
) -> list[Document]:
    """Build the ``mode="chunks"`` documents for a single ``.cv`` file.

    :param file:
        The parsed ``.cv`` file.
    :param stream_meta:
        Metadata carried by the source byte stream; lowest precedence.
    :param source_meta:
        Caller-supplied metadata for this source; overrides stream and
        chunk/payload metadata.
    :param source_label:
        Value stored under the ``"source"`` key of every document.
    :returns:
        One ``Document`` per resolved chunk, each with its vector on
        ``embedding``. When no chunks resolve, a single ``Document`` for the
        primary text payload, or an empty list if there is no such payload.
    """
    resolved = _resolve_chunks(file)
    if resolved:
        return [
            Document(
                meta={**stream_meta, **_chunk_meta(item, file), **source_meta, "source": source_label},
                content=item.text,
                embedding=list(item.vector),
            )
            for item in resolved
        ]

    # No chunks available: fall back to the first textual payload that is
    # marked as the file's primary payload.
    for candidate in file.payloads:
        if candidate.name == file.metadata.primary_payload and _is_text_payload(candidate):
            fallback_meta = {
                **stream_meta,
                **_payload_meta(candidate, file),
                **source_meta,
                "source": source_label,
            }
            return [Document(content=candidate.text(), meta=fallback_meta)]
    return []
@@ -11,6 +11,7 @@ from haystack.dataclasses import ByteStream
11
11
  from haystack_integrations.components.converters.cvfile import CVFileToDocument
12
12
 
13
13
  FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv"
14
+ UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv"
14
15
 
15
16
 
16
17
  @pytest.fixture(scope="module")
@@ -75,3 +76,33 @@ def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
75
76
  not_a_cv.write_bytes(b"not a real cv file")
76
77
  result = converter.run(sources=[not_a_cv])
77
78
  assert result["documents"] == []
79
+
80
+
81
def test_chunks_mode_attaches_a_vector_per_chunk() -> None:
    """Every document emitted in chunks mode carries text and a full vector."""
    # Without the embed extra the converter falls back to a plain payload
    # document with no embedding, so the assertions below would fail for a
    # reason unrelated to the behaviour under test. Skip instead.
    pytest.importorskip("cvfile.embed")
    if not FIXTURE.exists():
        pytest.skip(f"fixture not found: {FIXTURE}")
    docs = CVFileToDocument(mode="chunks").run(sources=[FIXTURE])["documents"]
    assert len(docs) >= 1
    for doc in docs:
        assert doc.embedding is not None
        assert len(doc.embedding) == doc.meta["embedding_dimension"]
        assert all(isinstance(v, float) for v in doc.embedding)
        assert doc.content.strip(), "chunk text should not be empty"
91
+
92
+
93
def test_invalid_mode_rejected() -> None:
    """An unknown ``mode`` is rejected at construction time."""
    # Match on the message so an unrelated ValueError cannot satisfy the test.
    with pytest.raises(ValueError, match="mode must be"):
        CVFileToDocument(mode="bogus")
96
+
97
+
98
def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None:
    """Chunk text for multi-byte content survives slicing intact."""
    # Without the embed extra the converter returns the whole primary payload
    # as one document, which would pass this test without slicing anything.
    pytest.importorskip("cvfile.embed")
    if not UNICODE_FIXTURE.exists():
        pytest.skip(f"fixture not found: {UNICODE_FIXTURE}")
    docs = CVFileToDocument(mode="chunks").run(sources=[UNICODE_FIXTURE])["documents"]
    assert docs, "chunks mode produced no documents"
    joined = "".join(d.content for d in docs)
    assert "Élodie" in joined
    assert "工程師" in joined
    assert "🚀" in joined
    assert "经验" in joined
    for doc in docs:
        # encode() raises on lone surrogates, so this guards against text that
        # was decoded with a surrogate-producing error handler.
        assert doc.content == doc.content.encode("utf-8").decode("utf-8")
        # A cut in the middle of a code point that was decoded leniently
        # would show up as U+FFFD replacement characters.
        assert "\ufffd" not in doc.content