chunksmith-multimodal 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. chunksmith_multimodal/Mapper/__init__.py +37 -0
  2. chunksmith_multimodal/Mapper/anchor_mapping_options.py +68 -0
  3. chunksmith_multimodal/Mapper/anchor_slice.py +200 -0
  4. chunksmith_multimodal/Mapper/anchor_slice_find.py +160 -0
  5. chunksmith_multimodal/Mapper/anchor_slice_meta.py +75 -0
  6. chunksmith_multimodal/Mapper/chunk_assignment.py +236 -0
  7. chunksmith_multimodal/Mapper/chunk_assignment_assign.py +193 -0
  8. chunksmith_multimodal/Mapper/combined_attach.py +275 -0
  9. chunksmith_multimodal/Mapper/combined_build.py +273 -0
  10. chunksmith_multimodal/Mapper/combined_index.py +43 -0
  11. chunksmith_multimodal/Mapper/combined_media.py +243 -0
  12. chunksmith_multimodal/Mapper/mapper.py +250 -0
  13. chunksmith_multimodal/Mapper/mapper_common.py +38 -0
  14. chunksmith_multimodal/__init__.py +77 -0
  15. chunksmith_multimodal/coded_formate.py +250 -0
  16. chunksmith_multimodal/common/__init__.py +0 -0
  17. chunksmith_multimodal/common/html_table.py +195 -0
  18. chunksmith_multimodal/config/__init__.py +81 -0
  19. chunksmith_multimodal/config/config.py +237 -0
  20. chunksmith_multimodal/display.py +46 -0
  21. chunksmith_multimodal/element_sources.py +39 -0
  22. chunksmith_multimodal/elements.py +165 -0
  23. chunksmith_multimodal/elements_metadata.py +165 -0
  24. chunksmith_multimodal/entry.py +75 -0
  25. chunksmith_multimodal/format.py +35 -0
  26. chunksmith_multimodal/group_by_title/__init__.py +5 -0
  27. chunksmith_multimodal/group_by_title/base.py +56 -0
  28. chunksmith_multimodal/group_by_title/chunk_strategies.py +17 -0
  29. chunksmith_multimodal/group_by_title/chunker.py +177 -0
  30. chunksmith_multimodal/group_by_title/chunking_options.py +286 -0
  31. chunksmith_multimodal/group_by_title/element_types.py +63 -0
  32. chunksmith_multimodal/group_by_title/element_types_constants.py +49 -0
  33. chunksmith_multimodal/group_by_title/html_table_splitter.py +218 -0
  34. chunksmith_multimodal/group_by_title/pre_chunk_combiner.py +90 -0
  35. chunksmith_multimodal/group_by_title/pre_chunk_model.py +127 -0
  36. chunksmith_multimodal/group_by_title/pre_chunk_pipeline.py +234 -0
  37. chunksmith_multimodal/group_by_title/table_accumulators.py +116 -0
  38. chunksmith_multimodal/group_by_title/table_chunker.py +248 -0
  39. chunksmith_multimodal/group_by_title/table_handling.py +16 -0
  40. chunksmith_multimodal/group_by_title/text_splitter.py +241 -0
  41. chunksmith_multimodal/group_by_title/title.py +186 -0
  42. chunksmith_multimodal/group_by_title/title_grouper.py +76 -0
  43. chunksmith_multimodal/group_by_title/typing_context.py +31 -0
  44. chunksmith_multimodal/helpers.py +158 -0
  45. chunksmith_multimodal/indexer/__init__.py +15 -0
  46. chunksmith_multimodal/indexer/anchor_fields.py +38 -0
  47. chunksmith_multimodal/indexer/client.py +184 -0
  48. chunksmith_multimodal/indexer/coded_chunks.py +79 -0
  49. chunksmith_multimodal/indexer/config.py +63 -0
  50. chunksmith_multimodal/indexer/defaults.py +38 -0
  51. chunksmith_multimodal/indexer/doc_description.py +45 -0
  52. chunksmith_multimodal/indexer/embedded_toc.py +146 -0
  53. chunksmith_multimodal/indexer/embedded_toc_merge.py +108 -0
  54. chunksmith_multimodal/indexer/embedded_toc_pages.py +105 -0
  55. chunksmith_multimodal/indexer/llm_config.py +57 -0
  56. chunksmith_multimodal/indexer/llm_progress.py +59 -0
  57. chunksmith_multimodal/indexer/options.py +57 -0
  58. chunksmith_multimodal/indexer/outline_backfill.py +140 -0
  59. chunksmith_multimodal/indexer/probes.py +78 -0
  60. chunksmith_multimodal/indexer/prompts.py +51 -0
  61. chunksmith_multimodal/indexer/prompts_core.py +181 -0
  62. chunksmith_multimodal/indexer/prompts_embedded_toc.py +146 -0
  63. chunksmith_multimodal/indexer/runner.py +212 -0
  64. chunksmith_multimodal/indexer/runner_extract.py +128 -0
  65. chunksmith_multimodal/indexer/runner_json.py +278 -0
  66. chunksmith_multimodal/indexer/tree.py +121 -0
  67. chunksmith_multimodal/multi_indexing_config.py +174 -0
  68. chunksmith_multimodal/outline.py +132 -0
  69. chunksmith_multimodal/pipeline.py +197 -0
  70. chunksmith_multimodal/pipeline_config.py +64 -0
  71. chunksmith_multimodal/pipeline_phases.py +155 -0
  72. chunksmith_multimodal/toon_codec.py +104 -0
  73. chunksmith_multimodal/unstructured/__init__.py +59 -0
  74. chunksmith_multimodal/unstructured/partition_api.py +226 -0
  75. chunksmith_multimodal/unstructured/partition_async.py +299 -0
  76. chunksmith_multimodal/unstructured/pdf_split.py +84 -0
  77. chunksmith_multimodal-0.3.0.dist-info/METADATA +23 -0
  78. chunksmith_multimodal-0.3.0.dist-info/RECORD +80 -0
  79. chunksmith_multimodal-0.3.0.dist-info/WHEEL +5 -0
  80. chunksmith_multimodal-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,37 @@
1
+ """Mapper output helpers for ChunkSmith TitleIndexer."""
2
+
3
+ from chunksmith_multimodal.Mapper.anchor_mapping_options import (
4
+ AnchorMappingOptions,
5
+ apply_anchor_mapping_options,
6
+ )
7
+ from chunksmith_multimodal.Mapper.combined_index import (
8
+ MAPPING_METHODS,
9
+ build_combined_index,
10
+ print_combined_index_summary,
11
+ title_chunks_to_chunk_nodes,
12
+ )
13
+ from chunksmith_multimodal.Mapper.mapper import (
14
+ assign_mapper_node_ids,
15
+ build_mapper,
16
+ format_mapper_structure,
17
+ format_mapper_mapping_report_text,
18
+ print_mapper_mapping_report,
19
+ save_mapper_txt,
20
+ summarize_anchor_mapping,
21
+ )
22
+
23
+ __all__ = [
24
+ "AnchorMappingOptions",
25
+ "apply_anchor_mapping_options",
26
+ "MAPPING_METHODS",
27
+ "assign_mapper_node_ids",
28
+ "build_combined_index",
29
+ "build_mapper",
30
+ "format_mapper_structure",
31
+ "print_combined_index_summary",
32
+ "format_mapper_mapping_report_text",
33
+ "print_mapper_mapping_report",
34
+ "save_mapper_txt",
35
+ "summarize_anchor_mapping",
36
+ "title_chunks_to_chunk_nodes",
37
+ ]
@@ -0,0 +1,68 @@
1
+ """Mapper anchor-slice settings (start/end boundary mode), separate from page-index LLM options."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
+ from chunksmith_multimodal.indexer.options import (
9
+ anchor_mapping_mode_label,
10
+ validate_anchor_mapping_flags,
11
+ )
12
+
13
+ __all__ = [
14
+ "AnchorMappingOptions",
15
+ "apply_anchor_mapping_options",
16
+ ]
17
+
18
+
19
@dataclass
class AnchorMappingOptions:
    """
    Pair of flags choosing which anchors bound a node's element slice.

    The mapper walks outline nodes in flat preorder; the flags select the mode:

    - ``both``: the node's own start anchor and own end anchor
    - ``start_only``: the node's start anchor; the slice ends where the next
      flat node starts (``use_end=False``)
    - ``end_only``: the slice starts at the previous node's end anchor and
      stops at the node's own end anchor (``use_start=False``)

    The textual mode label and the flag validation are delegated to
    ``chunksmith_multimodal.indexer.options``.
    """

    # Match on the node's start anchor (enabled by default).
    use_start_anchor_mapping: bool = True
    # Match on the node's end anchor (disabled by default).
    use_end_anchor_mapping: bool = False

    def mode_label(self) -> str:
        """Return the mode label computed by ``anchor_mapping_mode_label``."""
        start_flag = self.use_start_anchor_mapping
        end_flag = self.use_end_anchor_mapping
        return anchor_mapping_mode_label(use_start=start_flag, use_end=end_flag)

    def validate(self) -> None:
        """Run ``validate_anchor_mapping_flags`` on the current flag pair."""
        start_flag = self.use_start_anchor_mapping
        end_flag = self.use_end_anchor_mapping
        validate_anchor_mapping_flags(use_start=start_flag, use_end=end_flag)

    def to_indexer_options_dict(self) -> dict[str, bool | str]:
        """Build the patch merged into ``title_outline["indexer_options"]``."""
        patch: dict[str, bool | str] = {}
        patch["use_start_anchor_mapping"] = self.use_start_anchor_mapping
        patch["use_end_anchor_mapping"] = self.use_end_anchor_mapping
        patch["anchor_mapping_mode"] = self.mode_label()
        return patch
51
+
52
+
53
def apply_anchor_mapping_options(
    title_outline: dict[str, Any],
    options: AnchorMappingOptions | None,
) -> dict[str, Any]:
    """
    Write the anchor mapping flags into ``title_outline["indexer_options"]``.

    The outline dict is modified in place and also returned. With
    ``options=None`` the outline is returned untouched. Otherwise the options
    are validated first, then laid over a copy of any existing
    ``indexer_options`` so unrelated keys survive.

    Intended to run before ``build_mapper`` when only the mapper is re-run;
    the pipeline can also do it.
    """
    if options is not None:
        options.validate()
        existing = title_outline.get("indexer_options") or {}
        patched = dict(existing)
        patched.update(options.to_indexer_options_dict())
        title_outline["indexer_options"] = patched
    return title_outline
@@ -0,0 +1,200 @@
1
+ """O(n) element slicing by start/end anchor needles with page and next-start fallbacks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import copy
6
+ from typing import Any
7
+
8
+ from chunksmith_multimodal.indexer.anchor_fields import (
9
+ anchor_end_from_row,
10
+ anchor_start_from_row,
11
+ )
12
+ from chunksmith_multimodal.Mapper.anchor_slice_find import (
13
+ find_end_exclusive_index,
14
+ find_start_element_index,
15
+ trim_first_element_from_offset,
16
+ trim_last_element_to_anchor_end,
17
+ )
18
+ from chunksmith_multimodal.Mapper.anchor_slice_meta import (
19
+ anchor_mapping_detail_from_meta,
20
+ anchor_mapping_status_from_meta,
21
+ )
22
+
23
+ __all__ = [
24
+ "node_anchor_start",
25
+ "node_anchor_end",
26
+ "effective_boundary_needles",
27
+ "boundary_needles_for_node",
28
+ "slice_elements_by_anchor_range",
29
+ "anchor_mapping_status_from_meta",
30
+ "anchor_mapping_detail_from_meta",
31
+ ]
32
+
33
+
34
def node_anchor_start(node: dict[str, Any]) -> str:
    """Return the node's start anchor as read by ``anchor_start_from_row``."""
    start_anchor = anchor_start_from_row(node)
    return start_anchor
36
+
37
+
38
def node_anchor_end(node: dict[str, Any]) -> str:
    """Return the node's end anchor as read by ``anchor_end_from_row``."""
    end_anchor = anchor_end_from_row(node)
    return end_anchor
40
+
41
+
42
def effective_boundary_needles(
    flat_nodes: list[dict[str, Any]],
    index: int,
    *,
    use_start_mapping: bool,
    use_end_mapping: bool,
) -> tuple[str, str, str, str, str, str, str | None, str | None]:
    """
    Work out the boundary needles for the node at ``flat_nodes[index]``.

    The result is the 8-tuple::

        (declared_start, declared_end,
         slice_start, slice_end,
         next_start, prev_end,
         next_node_id, prev_node_id)

    The "declared" pair is what gets reported as metadata; the "slice" pair is
    what the element search is given. They differ only in the single-anchor
    modes:

    - both flags (or neither): the node's own start and end for both pairs
    - start only: declared end is the next flat node's start, slice end is ``""``
    - end only: declared start is the previous flat node's end, slice start is ``""``

    Neighbour values are ``""`` / ``None`` at the ends of the list; an empty or
    missing ``node_id`` is also reported as ``None``.
    """
    current = flat_nodes[index]
    own_start = node_anchor_start(current)
    own_end = node_anchor_end(current)

    # Following node in flat preorder, when there is one.
    next_start, next_nid = "", None
    if index + 1 < len(flat_nodes):
        follower = flat_nodes[index + 1]
        next_start = node_anchor_start(follower)
        next_nid = str(follower.get("node_id") or "") or None

    # Preceding node in flat preorder, when there is one.
    prev_end, prev_nid = "", None
    if index > 0:
        predecessor = flat_nodes[index - 1]
        prev_end = node_anchor_end(predecessor)
        prev_nid = str(predecessor.get("node_id") or "") or None

    if bool(use_start_mapping) == bool(use_end_mapping):
        # Both flags set, or neither: use the node's own anchors throughout.
        boundaries = (own_start, own_end, own_start, own_end)
    elif use_start_mapping:
        boundaries = (own_start, next_start, own_start, "")
    else:
        boundaries = (prev_end, own_end, "", own_end)

    return (*boundaries, next_start, prev_end, next_nid, prev_nid)
88
+
89
+
90
def boundary_needles_for_node(
    flat_nodes: list[dict[str, Any]],
    index: int,
    *,
    use_start_mapping: bool,
    use_end_mapping: bool,
) -> tuple[str, str, str, str, str | None, str | None]:
    """
    Return only the slice inputs for one flat-preorder node.

    This is ``effective_boundary_needles`` without its first two (declared)
    entries, i.e.
    ``(slice_start, slice_end, next_start, prev_end, next_node_id, prev_node_id)``.
    """
    needles = effective_boundary_needles(
        flat_nodes,
        index,
        use_start_mapping=use_start_mapping,
        use_end_mapping=use_end_mapping,
    )
    slice_start, slice_end, next_start, prev_end, next_nid, prev_nid = needles[2:]
    return slice_start, slice_end, next_start, prev_end, next_nid, prev_nid
108
+
109
+
110
def slice_elements_by_anchor_range(
    elements: list[dict[str, Any]],
    *,
    anchor_start: str,
    anchor_end: str,
    start_page: int | None,
    end_page: int | None,
    next_start_needle: str = "",
    prev_end_needle: str = "",
    anchor_mapping_mode: str = "both",
    next_node_id: str | None = None,
    prev_node_id: str | None = None,
    effective_start_needle: str = "",
    effective_end_needle: str = "",
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """
    Cut the ordered ``elements`` down to the span belonging to one outline node.

    Returns ``(sliced_elements, meta)``. The sliced elements are deep copies,
    so trimming never touches the caller's data. ``meta`` records the needles,
    which strategy located each boundary, and a status/detail summary for
    debugging.

    The next node's start needle is only handed to the end search in
    ``start_only`` mode; in that mode the node's own end anchor is skipped.
    An empty/None ``anchor_mapping_mode`` is treated as ``"both"``.
    """
    mode = anchor_mapping_mode or "both"
    start_only = mode == "start_only"

    meta: dict[str, Any] = {
        "mapping_anchor_mode": mode,
        "mapping_anchor_start": anchor_start,
        "mapping_anchor_end": anchor_end,
        "mapping_effective_start_needle": effective_start_needle,
        "mapping_effective_end_needle": effective_end_needle,
        "mapping_anchor_start_found": False,
        "mapping_anchor_end_found": False,
        "mapping_start_strategy": "",
        "mapping_end_strategy": "",
        "mapping_slice_start_strategy": "",
        "mapping_slice_end_strategy": "",
        "mapping_next_node_id": next_node_id,
        "mapping_prev_node_id": prev_node_id,
    }

    def _stamp_summary() -> None:
        # Status first, then detail, both derived from the meta collected so far.
        meta["mapping_anchor_status"] = anchor_mapping_status_from_meta(meta)
        meta["mapping_anchor_detail"] = anchor_mapping_detail_from_meta(meta)

    if not elements:
        _stamp_summary()
        return [], meta

    # --- locate the first element of the slice ---------------------------------
    first_index, start_found, start_strategy = find_start_element_index(
        elements,
        anchor_start,
        page_fallback=start_page,
        prev_end_needle=prev_end_needle,
    )
    meta["mapping_anchor_start_found"] = start_found
    meta["mapping_start_strategy"] = start_strategy

    # --- locate the exclusive end of the slice ---------------------------------
    stop_index, end_found, end_strategy = find_end_exclusive_index(
        elements,
        anchor_end,
        from_index=first_index,
        next_start_needle=next_start_needle if start_only else "",
        page_end_fallback=end_page,
        end_requires_next_start=start_only,
        allow_next_start=start_only,
    )
    meta["mapping_anchor_end_found"] = end_found
    meta["mapping_end_strategy"] = end_strategy
    meta["mapping_slice_start_strategy"] = start_strategy
    meta["mapping_slice_end_strategy"] = end_strategy

    picked = [copy.deepcopy(item) for item in elements[first_index:stop_index]]

    # Trim the leading element only when a needle (not a page) located the start.
    start_needles = {"anchor_start": anchor_start, "prev_end": prev_end_needle}
    leading_needle = start_needles.get(start_strategy, "")
    if leading_needle and picked:
        picked = trim_first_element_from_offset(picked, leading_needle)

    # Trim the trailing element only when the node's own end anchor was matched.
    matched_own_end = end_strategy.startswith("anchor_end")
    if anchor_end and picked and end_found and matched_own_end:
        picked = trim_last_element_to_anchor_end(picked, anchor_end)

    if start_strategy in ("prev_end", "anchor_start", "page_start"):
        meta["mapping_effective_start_boundary"] = start_strategy

    # Both "anchor_end" and "anchor_end_partial" report as "anchor_end".
    if matched_own_end:
        meta["mapping_effective_end_boundary"] = "anchor_end"
    else:
        meta["mapping_effective_end_boundary"] = end_strategy or "unknown"

    _stamp_summary()
    return picked, meta
@@ -0,0 +1,160 @@
1
+ """Needle search helpers for anchor-based element slicing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import copy
6
+ import re
7
+ from typing import Any
8
+
9
+ _WS = re.compile(r"\s+")
10
+
11
+
12
def element_page_number(element: dict[str, Any]) -> int | None:
    """
    Return the element's integer page number, or ``None`` when there is none.

    A top-level ``page_number`` wins; otherwise ``metadata["page_number"]`` is
    used when ``metadata`` is a dict. Non-``int`` values are ignored.
    """
    direct = element.get("page_number")
    if isinstance(direct, int):
        return direct

    nested_source = element.get("metadata")
    if not isinstance(nested_source, dict):
        return None

    nested = nested_source.get("page_number")
    return nested if isinstance(nested, int) else None
20
+
21
+
22
def element_text(element: dict[str, Any]) -> str:
    """Return the element's ``text`` as a string; missing or falsy values give ``""``."""
    raw = element.get("text")
    if not raw:
        return ""
    return str(raw)
24
+
25
+
26
def fold_text(text: str) -> str:
    """
    Normalise ``text`` for comparison.

    Whitespace runs become a single space, the ends are stripped, and the
    result is casefolded. A falsy ``text`` gives ``""``.
    """
    source = text or ""
    single_spaced = _WS.sub(" ", source)
    trimmed = single_spaced.strip()
    return trimmed.casefold()
28
+
29
+
30
def _casefold_with_offsets(text: str) -> tuple[str, list[int]]:
    """
    Casefold ``text`` one character at a time, remembering where each came from.

    Returns ``(folded, origin)`` where ``origin[k]`` is the index in ``text`` of
    the character that produced ``folded[k]``. Needed because casefolding can
    lengthen a string (``"ß"`` becomes ``"ss"``), which shifts offsets.
    """
    pieces: list[str] = []
    origin: list[int] = []
    for index, char in enumerate(text):
        folded_char = char.casefold()
        pieces.append(folded_char)
        origin.extend([index] * len(folded_char))
    return "".join(pieces), origin


def find_needle_span(text: str, needle: str, *, start_at: int = 0) -> tuple[int, int] | None:
    """
    Find ``needle`` in ``text``; return ``(start, end_exclusive)`` char offsets.

    The offsets always index the original ``text`` (so ``text[start:end]`` is
    the matched passage), even when casefolding changes the string length.

    Matching is case-insensitive (casefold) and whitespace-tolerant: any run of
    whitespace in the needle matches any run of whitespace in the text, so an
    anchor still matches across line breaks or doubled spaces.

    If that fails and the needle, minus trailing ``" .!?…"``, is at least 12
    characters long, a relaxed match is tried that tolerates a citation
    bracket before a trailing period (e.g. ``loss.`` vs ``loss [11].``).

    Args:
        text: Haystack to search.
        needle: Anchor text to look for.
        start_at: Offset in ``text`` at which the search begins; negative
            values are treated as 0.

    Returns:
        The span of the leftmost match, or ``None`` when ``text`` or ``needle``
        is empty, the needle is whitespace only, or nothing matches.
    """
    if not needle or not text:
        return None

    tokens = needle.casefold().split()
    if not tokens:
        return None

    base = max(start_at, 0)
    haystack = text.casefold()
    origin: list[int] | None = None
    search_from = base
    if len(haystack) != len(text):
        # Casefolding expanded at least one character, so folded offsets no
        # longer line up with ``text``; fold with an explicit offset map instead.
        haystack, origin = _casefold_with_offsets(text[base:])
        search_from = 0

    def to_text_span(match: re.Match[str]) -> tuple[int, int]:
        if origin is None:
            return match.start(), match.end()
        # Tokens are non-empty, so the match covers at least one character.
        return base + origin[match.start()], base + origin[match.end() - 1] + 1

    exact = r"\s+".join(re.escape(token) for token in tokens)
    match = re.compile(exact).search(haystack, search_from)
    if match:
        return to_text_span(match)

    core = needle.rstrip(" .!?…").strip()
    if len(core) < 12:
        # Too short for the relaxed match to be trustworthy.
        return None

    relaxed = (
        r"\s+".join(re.escape(token) for token in core.casefold().split())
        + r"(?:\s*\[\d+\])?\s*\.?"
    )
    match = re.compile(relaxed).search(haystack, search_from)
    if match:
        return to_text_span(match)

    return None
60
+
61
+
62
def find_needle_in_text(text: str, needle: str, *, start_at: int = 0) -> int | None:
    """Return the start offset reported by ``find_needle_span``, or ``None`` on no match."""
    hit = find_needle_span(text, needle, start_at=start_at)
    if not hit:
        return None
    begin, _end = hit
    return begin
65
+
66
+
67
def find_start_element_index(
    elements: list[dict[str, Any]],
    anchor_start: str,
    *,
    page_fallback: int | None,
    prev_end_needle: str = "",
) -> tuple[int, bool, str]:
    """
    Locate the element where a node's slice begins.

    Returns ``(index, found, strategy)``. Attempts, in order:

    1. ``"prev_end"``: only when ``anchor_start`` is empty and
       ``prev_end_needle`` is given; first element containing that needle.
    2. ``"anchor_start"``: first element containing ``anchor_start``.
    3. ``"page_start"``: first element whose page number equals
       ``page_fallback``; ``found`` is ``False``.

    When nothing applies the result is ``(0, False, "page_start")``.
    """

    def first_containing(needle: str) -> int | None:
        # Index of the first element whose text contains the needle.
        return next(
            (
                position
                for position, item in enumerate(elements)
                if find_needle_in_text(element_text(item), needle) is not None
            ),
            None,
        )

    if prev_end_needle and not anchor_start:
        position = first_containing(prev_end_needle)
        if position is not None:
            return position, True, "prev_end"

    if anchor_start:
        position = first_containing(anchor_start)
        if position is not None:
            return position, True, "anchor_start"

    if page_fallback is not None:
        position = next(
            (
                candidate
                for candidate, item in enumerate(elements)
                if element_page_number(item) == page_fallback
            ),
            None,
        )
        if position is not None:
            return position, False, "page_start"

    return 0, False, "page_start"
88
+
89
+
90
def find_end_exclusive_index(
    elements: list[dict[str, Any]],
    anchor_end: str,
    *,
    from_index: int,
    next_start_needle: str,
    page_end_fallback: int | None,
    end_requires_next_start: bool = False,
    allow_next_start: bool = False,
) -> tuple[int, bool, str]:
    """
    Compute the exclusive end index of the slice ``[from_index, end)``.

    Returns ``(end_index, found, strategy)``. Attempts, in order:

    1. ``"anchor_end"`` / ``"anchor_end_partial"``: first element at or after
       ``from_index`` containing ``anchor_end``. It is "partial" when text
       remains after the match. Skipped when ``end_requires_next_start``.
    2. ``"next_start"``: first element after ``from_index`` containing
       ``next_start_needle``; that element is excluded. Only when
       ``allow_next_start``.
    3. ``"page_end"``: one past the last element on ``page_end_fallback``,
       scanning until a later page appears; ``found`` is ``False``.
    4. ``"span_end"``: the end of ``elements``.

    ``"empty"`` is returned when ``from_index`` is already past the last element.
    """
    total = len(elements)
    if from_index >= total:
        return total, False, "empty"

    candidates = range(from_index, total)

    if anchor_end and not end_requires_next_start:
        for position in candidates:
            body = element_text(elements[position])
            hit = find_needle_span(body, anchor_end)
            if hit is None:
                continue
            # "Partial" means non-whitespace text follows the matched anchor.
            reaches_text_end = hit[1] >= len(body.rstrip())
            label = "anchor_end" if reaches_text_end else "anchor_end_partial"
            return position + 1, True, label

    if allow_next_start and next_start_needle:
        # Start one past from_index: the next node cannot begin in our first element.
        for position in range(from_index + 1, total):
            body = element_text(elements[position])
            if find_needle_in_text(body, next_start_needle) is not None:
                return position, True, "next_start"

    if page_end_fallback is not None:
        last_on_page = from_index
        for position in candidates:
            page = element_page_number(elements[position])
            if page == page_end_fallback:
                last_on_page = position
            elif page is not None and page > page_end_fallback:
                break
        return last_on_page + 1, False, "page_end"

    return total, False, "span_end"
136
+
137
+
138
def trim_first_element_from_offset(
    elements: list[dict[str, Any]],
    needle: str,
) -> list[dict[str, Any]]:
    """
    Drop any text that precedes ``needle`` in the first element.

    The needle text itself is kept. The first element is deep-copied before
    being changed and a new list is returned. With no elements or no needle,
    the input list is returned as is.
    """
    if not (elements and needle):
        return elements

    head, *rest = elements
    head = copy.deepcopy(head)
    body = element_text(head)
    hit = find_needle_span(body, needle)
    # Nothing to cut when the needle is absent or already at offset 0.
    if hit is not None and hit[0] > 0:
        head["text"] = body[hit[0] :].strip()
    return [head] + rest
150
+
151
+
152
def trim_last_element_to_anchor_end(elements: list[dict[str, Any]], anchor_end: str) -> list[dict[str, Any]]:
    """
    Drop any text that follows ``anchor_end`` in the last element.

    The anchor text itself is kept. The last element is deep-copied before
    being changed and a new list is returned. With no elements or no anchor,
    the input list is returned as is.
    """
    if not (elements and anchor_end):
        return elements

    *leading, tail = elements
    tail = copy.deepcopy(tail)
    body = element_text(tail)
    hit = find_needle_span(body, anchor_end)
    if hit is not None:
        tail["text"] = body[: hit[1]].strip()
    return leading + [tail]
@@ -0,0 +1,75 @@
1
+ """Anchor mapping status and detail strings from slice metadata."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+
8
def anchor_mapping_status_from_meta(meta: dict[str, Any]) -> str:
    """
    Grade one node's slice as ``"pass"`` or ``"failed"`` from its metadata.

    In every mode the start must have been found by something other than the
    page fallback. The end requirement depends on ``mapping_anchor_mode``
    (a missing mode counts as ``both``):

    - ``start_only``: the end must not be the page fallback
    - ``end_only``: the end must be found and must not be the page fallback
    - otherwise: the end must be found via an ``anchor_end*`` strategy
    """
    mode = str(meta.get("mapping_anchor_mode") or "both")
    start_strategy = str(meta.get("mapping_start_strategy") or "")
    end_strategy = str(meta.get("mapping_end_strategy") or "")
    end_found = bool(meta.get("mapping_anchor_end_found"))

    start_ok = bool(meta.get("mapping_anchor_start_found")) and start_strategy != "page_start"

    if mode == "start_only":
        end_ok = end_strategy != "page_end"
    elif mode == "end_only":
        end_ok = end_found and end_strategy != "page_end"
    else:
        end_ok = end_found and end_strategy.startswith("anchor_end")

    return "pass" if start_ok and end_ok else "failed"
35
+
36
+
37
def anchor_mapping_detail_from_meta(meta: dict[str, Any]) -> str:
    """
    Summarise in one line which boundaries and fallbacks a slice used.

    The result has three ``"; "``-separated parts: the mode, how the start was
    located, and how the end was located. Neighbour node ids are appended when
    the previous node's end or the next node's start served as the boundary.
    A missing mode reads as ``both`` and a missing end strategy as ``unknown``.
    """
    mode = str(meta.get("mapping_anchor_mode") or "both")
    start_strategy = str(meta.get("mapping_start_strategy") or "")
    end_strategy = str(meta.get("mapping_end_strategy") or "") or "unknown"

    # --- start part -------------------------------------------------------------
    if start_strategy == "prev_end":
        prev_id = meta.get("mapping_prev_node_id")
        start_part = "start: prev node end"
        if prev_id:
            start_part = f"{start_part} (node {prev_id})"
    elif meta.get("mapping_anchor_start_found"):
        start_part = "start: anchor"
    else:
        start_part = "start: page fallback"

    # --- end part ---------------------------------------------------------------
    anchored_end = bool(meta.get("mapping_anchor_end_found")) and end_strategy.startswith("anchor_end")
    if end_strategy == "next_start":
        next_id = meta.get("mapping_next_node_id")
        end_part = "end: next node start"
        if next_id:
            end_part = f"{end_part} (node {next_id})"
    elif anchored_end:
        if end_strategy == "anchor_end_partial":
            end_part = "end: anchor (trimmed at mid-element)"
        else:
            end_part = "end: anchor"
    elif end_strategy == "page_end":
        end_part = "end: page fallback"
    elif end_strategy == "span_end":
        end_part = "end: span end"
    else:
        end_part = f"end: {end_strategy}"

    return "; ".join([f"mode: {mode}", start_part, end_part])