chunksmith-multimodal 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksmith_multimodal/Mapper/__init__.py +37 -0
- chunksmith_multimodal/Mapper/anchor_mapping_options.py +68 -0
- chunksmith_multimodal/Mapper/anchor_slice.py +200 -0
- chunksmith_multimodal/Mapper/anchor_slice_find.py +160 -0
- chunksmith_multimodal/Mapper/anchor_slice_meta.py +75 -0
- chunksmith_multimodal/Mapper/chunk_assignment.py +236 -0
- chunksmith_multimodal/Mapper/chunk_assignment_assign.py +193 -0
- chunksmith_multimodal/Mapper/combined_attach.py +275 -0
- chunksmith_multimodal/Mapper/combined_build.py +273 -0
- chunksmith_multimodal/Mapper/combined_index.py +43 -0
- chunksmith_multimodal/Mapper/combined_media.py +243 -0
- chunksmith_multimodal/Mapper/mapper.py +250 -0
- chunksmith_multimodal/Mapper/mapper_common.py +38 -0
- chunksmith_multimodal/__init__.py +77 -0
- chunksmith_multimodal/coded_formate.py +250 -0
- chunksmith_multimodal/common/__init__.py +0 -0
- chunksmith_multimodal/common/html_table.py +195 -0
- chunksmith_multimodal/config/__init__.py +81 -0
- chunksmith_multimodal/config/config.py +237 -0
- chunksmith_multimodal/display.py +46 -0
- chunksmith_multimodal/element_sources.py +39 -0
- chunksmith_multimodal/elements.py +165 -0
- chunksmith_multimodal/elements_metadata.py +165 -0
- chunksmith_multimodal/entry.py +75 -0
- chunksmith_multimodal/format.py +35 -0
- chunksmith_multimodal/group_by_title/__init__.py +5 -0
- chunksmith_multimodal/group_by_title/base.py +56 -0
- chunksmith_multimodal/group_by_title/chunk_strategies.py +17 -0
- chunksmith_multimodal/group_by_title/chunker.py +177 -0
- chunksmith_multimodal/group_by_title/chunking_options.py +286 -0
- chunksmith_multimodal/group_by_title/element_types.py +63 -0
- chunksmith_multimodal/group_by_title/element_types_constants.py +49 -0
- chunksmith_multimodal/group_by_title/html_table_splitter.py +218 -0
- chunksmith_multimodal/group_by_title/pre_chunk_combiner.py +90 -0
- chunksmith_multimodal/group_by_title/pre_chunk_model.py +127 -0
- chunksmith_multimodal/group_by_title/pre_chunk_pipeline.py +234 -0
- chunksmith_multimodal/group_by_title/table_accumulators.py +116 -0
- chunksmith_multimodal/group_by_title/table_chunker.py +248 -0
- chunksmith_multimodal/group_by_title/table_handling.py +16 -0
- chunksmith_multimodal/group_by_title/text_splitter.py +241 -0
- chunksmith_multimodal/group_by_title/title.py +186 -0
- chunksmith_multimodal/group_by_title/title_grouper.py +76 -0
- chunksmith_multimodal/group_by_title/typing_context.py +31 -0
- chunksmith_multimodal/helpers.py +158 -0
- chunksmith_multimodal/indexer/__init__.py +15 -0
- chunksmith_multimodal/indexer/anchor_fields.py +38 -0
- chunksmith_multimodal/indexer/client.py +184 -0
- chunksmith_multimodal/indexer/coded_chunks.py +79 -0
- chunksmith_multimodal/indexer/config.py +63 -0
- chunksmith_multimodal/indexer/defaults.py +38 -0
- chunksmith_multimodal/indexer/doc_description.py +45 -0
- chunksmith_multimodal/indexer/embedded_toc.py +146 -0
- chunksmith_multimodal/indexer/embedded_toc_merge.py +108 -0
- chunksmith_multimodal/indexer/embedded_toc_pages.py +105 -0
- chunksmith_multimodal/indexer/llm_config.py +57 -0
- chunksmith_multimodal/indexer/llm_progress.py +59 -0
- chunksmith_multimodal/indexer/options.py +57 -0
- chunksmith_multimodal/indexer/outline_backfill.py +140 -0
- chunksmith_multimodal/indexer/probes.py +78 -0
- chunksmith_multimodal/indexer/prompts.py +51 -0
- chunksmith_multimodal/indexer/prompts_core.py +181 -0
- chunksmith_multimodal/indexer/prompts_embedded_toc.py +146 -0
- chunksmith_multimodal/indexer/runner.py +212 -0
- chunksmith_multimodal/indexer/runner_extract.py +128 -0
- chunksmith_multimodal/indexer/runner_json.py +278 -0
- chunksmith_multimodal/indexer/tree.py +121 -0
- chunksmith_multimodal/multi_indexing_config.py +174 -0
- chunksmith_multimodal/outline.py +132 -0
- chunksmith_multimodal/pipeline.py +197 -0
- chunksmith_multimodal/pipeline_config.py +64 -0
- chunksmith_multimodal/pipeline_phases.py +155 -0
- chunksmith_multimodal/toon_codec.py +104 -0
- chunksmith_multimodal/unstructured/__init__.py +59 -0
- chunksmith_multimodal/unstructured/partition_api.py +226 -0
- chunksmith_multimodal/unstructured/partition_async.py +299 -0
- chunksmith_multimodal/unstructured/pdf_split.py +84 -0
- chunksmith_multimodal-0.3.0.dist-info/METADATA +23 -0
- chunksmith_multimodal-0.3.0.dist-info/RECORD +80 -0
- chunksmith_multimodal-0.3.0.dist-info/WHEEL +5 -0
- chunksmith_multimodal-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Mapper output helpers for ChunkSmith TitleIndexer."""
|
|
2
|
+
|
|
3
|
+
from chunksmith_multimodal.Mapper.anchor_mapping_options import (
|
|
4
|
+
AnchorMappingOptions,
|
|
5
|
+
apply_anchor_mapping_options,
|
|
6
|
+
)
|
|
7
|
+
from chunksmith_multimodal.Mapper.combined_index import (
|
|
8
|
+
MAPPING_METHODS,
|
|
9
|
+
build_combined_index,
|
|
10
|
+
print_combined_index_summary,
|
|
11
|
+
title_chunks_to_chunk_nodes,
|
|
12
|
+
)
|
|
13
|
+
from chunksmith_multimodal.Mapper.mapper import (
|
|
14
|
+
assign_mapper_node_ids,
|
|
15
|
+
build_mapper,
|
|
16
|
+
format_mapper_structure,
|
|
17
|
+
format_mapper_mapping_report_text,
|
|
18
|
+
print_mapper_mapping_report,
|
|
19
|
+
save_mapper_txt,
|
|
20
|
+
summarize_anchor_mapping,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"AnchorMappingOptions",
|
|
25
|
+
"apply_anchor_mapping_options",
|
|
26
|
+
"MAPPING_METHODS",
|
|
27
|
+
"assign_mapper_node_ids",
|
|
28
|
+
"build_combined_index",
|
|
29
|
+
"build_mapper",
|
|
30
|
+
"format_mapper_structure",
|
|
31
|
+
"print_combined_index_summary",
|
|
32
|
+
"format_mapper_mapping_report_text",
|
|
33
|
+
"print_mapper_mapping_report",
|
|
34
|
+
"save_mapper_txt",
|
|
35
|
+
"summarize_anchor_mapping",
|
|
36
|
+
"title_chunks_to_chunk_nodes",
|
|
37
|
+
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Mapper anchor-slice settings (start/end boundary mode), separate from page-index LLM options."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from chunksmith_multimodal.indexer.options import (
|
|
9
|
+
anchor_mapping_mode_label,
|
|
10
|
+
validate_anchor_mapping_flags,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"AnchorMappingOptions",
|
|
15
|
+
"apply_anchor_mapping_options",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class AnchorMappingOptions:
    """
    Boundary mode used by the mapper when slicing raw elements per outline node.

    Outline nodes are processed in flat preorder. The two flags combine into
    one of these modes:

    - ``both``: the node's own start anchor and its own end anchor
    - ``start_only``: the node's own start; the slice ends at the next flat
      node's start (``use_end=False``)
    - ``end_only``: the slice starts at the previous node's end and stops at
      the node's own end (``use_start=False``)
    """

    use_start_anchor_mapping: bool = True
    use_end_anchor_mapping: bool = False

    def mode_label(self) -> str:
        """Return the label ``anchor_mapping_mode_label`` computes for the current flags."""
        start_flag = self.use_start_anchor_mapping
        end_flag = self.use_end_anchor_mapping
        return anchor_mapping_mode_label(use_start=start_flag, use_end=end_flag)

    def validate(self) -> None:
        """
        Hand both flags to ``validate_anchor_mapping_flags``.

        NOTE(review): presumably raises on an unsupported flag combination;
        confirm against ``indexer.options``.
        """
        start_flag = self.use_start_anchor_mapping
        end_flag = self.use_end_anchor_mapping
        validate_anchor_mapping_flags(use_start=start_flag, use_end=end_flag)

    def to_indexer_options_dict(self) -> dict[str, bool | str]:
        """Build the patch merged into the outline's ``indexer_options`` mapping."""
        patch: dict[str, bool | str] = {}
        patch["use_start_anchor_mapping"] = self.use_start_anchor_mapping
        patch["use_end_anchor_mapping"] = self.use_end_anchor_mapping
        patch["anchor_mapping_mode"] = self.mode_label()
        return patch
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def apply_anchor_mapping_options(
    title_outline: dict[str, Any],
    options: AnchorMappingOptions | None,
) -> dict[str, Any]:
    """
    Write the anchor mapping flags into ``title_outline["indexer_options"]``.

    ``title_outline`` is modified in place and also returned. When ``options``
    is ``None`` nothing is touched. Otherwise the options are validated first,
    then layered over a copy of any existing ``indexer_options`` so the
    previous mapping object itself is never mutated.

    Intended to run before ``build_mapper`` when only the mapper is re-run;
    the pipeline can also do this on the caller's behalf.
    """
    if options is not None:
        options.validate()
        current = title_outline.get("indexer_options") or {}
        combined = dict(current)
        combined.update(options.to_indexer_options_dict())
        title_outline["indexer_options"] = combined
    return title_outline
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""O(n) element slicing by start/end anchor needles with page and next-start fallbacks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import copy
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from chunksmith_multimodal.indexer.anchor_fields import (
|
|
9
|
+
anchor_end_from_row,
|
|
10
|
+
anchor_start_from_row,
|
|
11
|
+
)
|
|
12
|
+
from chunksmith_multimodal.Mapper.anchor_slice_find import (
|
|
13
|
+
find_end_exclusive_index,
|
|
14
|
+
find_start_element_index,
|
|
15
|
+
trim_first_element_from_offset,
|
|
16
|
+
trim_last_element_to_anchor_end,
|
|
17
|
+
)
|
|
18
|
+
from chunksmith_multimodal.Mapper.anchor_slice_meta import (
|
|
19
|
+
anchor_mapping_detail_from_meta,
|
|
20
|
+
anchor_mapping_status_from_meta,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"node_anchor_start",
|
|
25
|
+
"node_anchor_end",
|
|
26
|
+
"effective_boundary_needles",
|
|
27
|
+
"boundary_needles_for_node",
|
|
28
|
+
"slice_elements_by_anchor_range",
|
|
29
|
+
"anchor_mapping_status_from_meta",
|
|
30
|
+
"anchor_mapping_detail_from_meta",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def node_anchor_start(node: dict[str, Any]) -> str:
    """Return the start-anchor needle of an outline node, as read by ``anchor_start_from_row``."""
    start_needle = anchor_start_from_row(node)
    return start_needle
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def node_anchor_end(node: dict[str, Any]) -> str:
    """Return the end-anchor needle of an outline node, as read by ``anchor_end_from_row``."""
    end_needle = anchor_end_from_row(node)
    return end_needle
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def effective_boundary_needles(
    flat_nodes: list[dict[str, Any]],
    index: int,
    *,
    use_start_mapping: bool,
    use_end_mapping: bool,
) -> tuple[str, str, str, str, str, str, str | None, str | None]:
    """
    Work out the declared boundaries and the slice needles for one flat node.

    The "declared" pair is what gets reported in metadata; the "slice" pair is
    what the element search actually looks for.

    Returns the 8-tuple::

        (declared_start, declared_end,
         slice_start, slice_end,
         next_start, prev_end,
         next_node_id, prev_node_id)

    Per mode:

    - both: the node's own start and end anchors for both pairs
    - start_only: own start; declared end is the next flat node's start and
      the slice end needle is left empty
    - end_only: declared start is the previous flat node's end and the slice
      start needle is left empty; own end
    - neither flag set: treated exactly like ``both``

    Neighbour needles are ``""`` and neighbour ids ``None`` when there is no
    neighbour; an empty/missing ``node_id`` is also reported as ``None``.
    """
    current = flat_nodes[index]
    own_start = node_anchor_start(current)
    own_end = node_anchor_end(current)

    # Following node in flat preorder, if any.
    next_start, next_nid = "", None
    if index + 1 < len(flat_nodes):
        following = flat_nodes[index + 1]
        next_start = node_anchor_start(following)
        next_nid = str(following.get("node_id") or "") or None

    # Preceding node in flat preorder, if any.
    prev_end, prev_nid = "", None
    if index > 0:
        preceding = flat_nodes[index - 1]
        prev_end = node_anchor_end(preceding)
        prev_nid = str(preceding.get("node_id") or "") or None

    if use_start_mapping and not use_end_mapping:
        declared_start, declared_end = own_start, next_start
        slice_start, slice_end = own_start, ""
    elif use_end_mapping and not use_start_mapping:
        declared_start, declared_end = prev_end, own_end
        slice_start, slice_end = "", own_end
    else:
        # Both flags set, or neither: use the node's own anchors throughout.
        declared_start, declared_end = own_start, own_end
        slice_start, slice_end = own_start, own_end

    return (
        declared_start,
        declared_end,
        slice_start,
        slice_end,
        next_start,
        prev_end,
        next_nid,
        prev_nid,
    )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def boundary_needles_for_node(
    flat_nodes: list[dict[str, Any]],
    index: int,
    *,
    use_start_mapping: bool,
    use_end_mapping: bool,
) -> tuple[str, str, str, str, str | None, str | None]:
    """
    Return only the slice inputs for one flat-preorder node.

    Thin projection of ``effective_boundary_needles`` that drops the two
    leading "declared" boundaries and yields
    ``(slice_start, slice_end, next_start, prev_end, next_node_id, prev_node_id)``.
    """
    needles = effective_boundary_needles(
        flat_nodes,
        index,
        use_start_mapping=use_start_mapping,
        use_end_mapping=use_end_mapping,
    )
    slice_start, slice_end, next_start, prev_end, next_nid, prev_nid = needles[2:]
    return slice_start, slice_end, next_start, prev_end, next_nid, prev_nid
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def slice_elements_by_anchor_range(
    elements: list[dict[str, Any]],
    *,
    anchor_start: str,
    anchor_end: str,
    start_page: int | None,
    end_page: int | None,
    next_start_needle: str = "",
    prev_end_needle: str = "",
    anchor_mapping_mode: str = "both",
    next_node_id: str | None = None,
    prev_node_id: str | None = None,
    effective_start_needle: str = "",
    effective_end_needle: str = "",
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """
    Cut the ordered ``elements`` down to the range belonging to one outline node.

    Args:
        elements: Ordered raw elements; they are deep-copied, never mutated.
        anchor_start: Needle marking the node's start (may be empty).
        anchor_end: Needle marking the node's end (may be empty).
        start_page: Page used as the start fallback when no needle matches.
        end_page: Page used as the end fallback when no needle matches.
        next_start_needle: Start needle of the next node; only consulted when
            the mode is ``start_only``.
        prev_end_needle: End needle of the previous node, forwarded to the
            start search.
        anchor_mapping_mode: ``both`` / ``start_only`` / ``end_only``; an
            empty value is treated as ``both``.
        next_node_id: Recorded in the metadata only.
        prev_node_id: Recorded in the metadata only.
        effective_start_needle: Recorded in the metadata only.
        effective_end_needle: Recorded in the metadata only.

    Returns:
        ``(sliced_elements, meta)`` where ``meta`` holds ``mapping_*`` debug
        keys describing which strategies located each boundary, plus the
        derived ``mapping_anchor_status`` and ``mapping_anchor_detail``.
    """
    mode = anchor_mapping_mode or "both"
    start_only = mode == "start_only"

    meta: dict[str, Any] = {
        "mapping_anchor_mode": mode,
        "mapping_anchor_start": anchor_start,
        "mapping_anchor_end": anchor_end,
        "mapping_effective_start_needle": effective_start_needle,
        "mapping_effective_end_needle": effective_end_needle,
        "mapping_anchor_start_found": False,
        "mapping_anchor_end_found": False,
        "mapping_start_strategy": "",
        "mapping_end_strategy": "",
        "mapping_slice_start_strategy": "",
        "mapping_slice_end_strategy": "",
        "mapping_next_node_id": next_node_id,
        "mapping_prev_node_id": prev_node_id,
    }

    if not elements:
        # Nothing to search: report status/detail from the untouched defaults.
        meta["mapping_anchor_status"] = anchor_mapping_status_from_meta(meta)
        meta["mapping_anchor_detail"] = anchor_mapping_detail_from_meta(meta)
        return [], meta

    start_i, start_found, start_strategy = find_start_element_index(
        elements,
        anchor_start,
        page_fallback=start_page,
        prev_end_needle=prev_end_needle,
    )
    end_i, end_found, end_strategy = find_end_exclusive_index(
        elements,
        anchor_end,
        from_index=start_i,
        next_start_needle=next_start_needle if start_only else "",
        page_end_fallback=end_page,
        end_requires_next_start=start_only,
        allow_next_start=start_only,
    )
    # All of these keys already exist, so key order in ``meta`` is unchanged.
    meta.update(
        {
            "mapping_anchor_start_found": start_found,
            "mapping_anchor_end_found": end_found,
            "mapping_start_strategy": start_strategy,
            "mapping_end_strategy": end_strategy,
            "mapping_slice_start_strategy": start_strategy,
            "mapping_slice_end_strategy": end_strategy,
        }
    )

    sliced = [copy.deepcopy(element) for element in elements[start_i:end_i]]

    # Trim the first element with whichever needle actually located the start.
    # NOTE(review): for "prev_end" the trim keeps text from the *start* of the
    # previous node's end anchor, so that anchor text stays in this slice too;
    # confirm this overlap is intended.
    if start_strategy == "anchor_start":
        trim_start = anchor_start
    elif start_strategy == "prev_end":
        trim_start = prev_end_needle
    else:
        trim_start = ""
    if trim_start and sliced:
        sliced = trim_first_element_from_offset(sliced, trim_start)

    ended_on_anchor = end_strategy.startswith("anchor_end")
    if anchor_end and sliced and end_found and ended_on_anchor:
        sliced = trim_last_element_to_anchor_end(sliced, anchor_end)

    if start_strategy in ("prev_end", "anchor_start", "page_start"):
        meta["mapping_effective_start_boundary"] = start_strategy

    # "anchor_end" and "anchor_end_partial" both report as "anchor_end"; every
    # other strategy is reported verbatim, with "unknown" for an empty one.
    meta["mapping_effective_end_boundary"] = (
        "anchor_end" if ended_on_anchor else (end_strategy or "unknown")
    )

    meta["mapping_anchor_status"] = anchor_mapping_status_from_meta(meta)
    meta["mapping_anchor_detail"] = anchor_mapping_detail_from_meta(meta)
    return sliced, meta
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Needle search helpers for anchor-based element slicing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import copy
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
_WS = re.compile(r"\s+")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def element_page_number(element: dict[str, Any]) -> int | None:
    """
    Return the element's integer page number, or ``None`` when there is none.

    The top-level ``page_number`` wins; otherwise ``metadata["page_number"]``
    is used when ``metadata`` is a dict. Non-integer values are ignored.
    """
    top_level = element.get("page_number")
    if isinstance(top_level, int):
        return top_level

    nested = element.get("metadata")
    if not isinstance(nested, dict):
        return None

    nested_page = nested.get("page_number")
    return nested_page if isinstance(nested_page, int) else None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def element_text(element: dict[str, Any]) -> str:
    """Return the element's ``text`` as a string; missing or falsy values become ``""``."""
    raw = element.get("text")
    return str(raw) if raw else ""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def fold_text(text: str) -> str:
    """Collapse each whitespace run to one space, trim both ends and casefold."""
    return " ".join((text or "").split()).casefold()


def _folded_to_original_offsets(text: str) -> list[int]:
    """
    Map each offset of ``text.casefold()`` to the index of its source character.

    Case folding is applied character by character, and one character may fold
    to several (``"ß"`` -> ``"ss"``), so the entry for a folded offset is the
    index in ``text`` of the character that produced it.
    """
    origins: list[int] = []
    for index, char in enumerate(text):
        origins.extend([index] * len(char.casefold()))
    return origins


def find_needle_span(text: str, needle: str, *, start_at: int = 0) -> tuple[int, int] | None:
    """
    Find ``needle`` in ``text``; return ``(start, end_exclusive)`` char offsets.

    Matching is case-insensitive (casefold) and the needle is whitespace
    normalised first. If the plain search fails and the needle is at least 12
    characters long once trailing ``" .!?…"`` is removed, a second pass
    tolerates a citation bracket before a trailing period
    (e.g. ``loss.`` vs ``loss [11].``).

    The returned offsets always index the *original* ``text``, so callers can
    slice ``text`` with them even when casefolding changes the string length.
    ``start_at`` is likewise an offset into ``text``; a negative value counts
    from the end, as with ``str.find``.

    Returns ``None`` when either argument is empty or nothing matches.
    """
    if not needle or not text:
        return None

    nfold = fold_text(needle)
    if not nfold:
        return None

    # Normalise a negative start the way str.find does, so both search passes
    # below agree on where the search begins.
    if start_at < 0:
        start_at = max(len(text) + start_at, 0)

    folded = text.casefold()
    # Equal lengths mean every character folded to exactly one character, so
    # folded and original offsets coincide and no translation is needed.
    origins = None if len(folded) == len(text) else _folded_to_original_offsets(text)
    folded_start = start_at if origins is None else len(text[:start_at].casefold())

    hit = folded.find(nfold, folded_start)
    if hit >= 0:
        folded_span = (hit, hit + len(nfold))
    else:
        core = needle.rstrip(" .!?…").strip()
        if len(core) < 12:
            # Too short for the tolerant pattern to be trustworthy.
            return None
        pattern = re.escape(core.casefold()) + r"(?:\s*\[\d+\])?\s*\.?"
        match = re.search(pattern, folded[folded_start:])
        if match is None:
            return None
        folded_span = (folded_start + match.start(), folded_start + match.end())

    if origins is None:
        return folded_span

    # Translate back to original offsets. A match that begins or ends inside a
    # multi-character expansion is widened to cover the whole source character.
    first, stop = folded_span
    return origins[first], origins[stop - 1] + 1
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def find_needle_in_text(text: str, needle: str, *, start_at: int = 0) -> int | None:
    """Return only the start offset reported by ``find_needle_span``, or ``None`` on no match."""
    located = find_needle_span(text, needle, start_at=start_at)
    if located is None:
        return None
    return located[0]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def find_start_element_index(
    elements: list[dict[str, Any]],
    anchor_start: str,
    *,
    page_fallback: int | None,
    prev_end_needle: str = "",
) -> tuple[int, bool, str]:
    """
    Locate the index of the first element of a node's slice.

    Strategies, in order:

    1. ``prev_end`` - only when there is no ``anchor_start``: the first
       element whose text contains ``prev_end_needle``.
    2. ``anchor_start`` - the first element whose text contains it.
    3. ``page_start`` - the first element on ``page_fallback``; if that also
       fails (or no page is given) index ``0`` is used.

    Returns ``(index, found, strategy)``; ``found`` is ``True`` only for the
    two needle-based strategies.
    """

    def first_containing(needle: str) -> int | None:
        # Index of the first element whose text contains ``needle``.
        for position, candidate in enumerate(elements):
            if find_needle_in_text(element_text(candidate), needle) is not None:
                return position
        return None

    if prev_end_needle and not anchor_start:
        position = first_containing(prev_end_needle)
        if position is not None:
            return position, True, "prev_end"

    if anchor_start:
        position = first_containing(anchor_start)
        if position is not None:
            return position, True, "anchor_start"

    if page_fallback is not None:
        for position, candidate in enumerate(elements):
            if element_page_number(candidate) == page_fallback:
                return position, False, "page_start"

    return 0, False, "page_start"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def find_end_exclusive_index(
    elements: list[dict[str, Any]],
    anchor_end: str,
    *,
    from_index: int,
    next_start_needle: str,
    page_end_fallback: int | None,
    end_requires_next_start: bool = False,
    allow_next_start: bool = False,
) -> tuple[int, bool, str]:
    """
    Compute the exclusive end index of the slice ``[from_index, end)``.

    Strategies, in order:

    1. ``anchor_end`` / ``anchor_end_partial`` - the first element at or after
       ``from_index`` containing ``anchor_end``; "partial" means the match
       stops before the end of that element's text. Skipped entirely when
       ``end_requires_next_start`` is set (start_only mode).
    2. ``next_start`` - the first element *after* ``from_index`` containing
       ``next_start_needle``; only tried when ``allow_next_start`` is set
       (start_only mode). That element is excluded from the slice.
    3. ``page_end`` - one past the last element on ``page_end_fallback``,
       scanning until an element on a later page is seen.
    4. ``span_end`` - everything up to the end of ``elements``.

    Returns ``(end_index, found, strategy)``; ``found`` is ``True`` only for
    the needle-based strategies. ``from_index`` beyond the list yields
    ``(len(elements), False, "empty")``.
    """
    total = len(elements)
    if from_index >= total:
        return total, False, "empty"

    if anchor_end and not end_requires_next_start:
        for position in range(from_index, total):
            body = element_text(elements[position])
            span = find_needle_span(body, anchor_end)
            if span is None:
                continue
            reaches_element_end = span[1] >= len(body.rstrip())
            label = "anchor_end" if reaches_element_end else "anchor_end_partial"
            return position + 1, True, label

    if allow_next_start and next_start_needle:
        # Start one past ``from_index``: the node's own first element can
        # never be the next node's start.
        for position in range(from_index + 1, total):
            body = element_text(elements[position])
            if find_needle_in_text(body, next_start_needle) is not None:
                return position, True, "next_start"

    if page_end_fallback is None:
        return total, False, "span_end"

    last_on_page = from_index
    for position in range(from_index, total):
        page = element_page_number(elements[position])
        if page == page_end_fallback:
            last_on_page = position
        elif page is not None and page > page_end_fallback:
            break
    return last_on_page + 1, False, "page_end"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def trim_first_element_from_offset(
    elements: list[dict[str, Any]],
    needle: str,
) -> list[dict[str, Any]]:
    """
    Drop any text that precedes ``needle`` in the first element.

    The first element is deep-copied before its ``text`` is rewritten, so the
    input element is left untouched; the remaining elements are reused as-is
    in a new list. With no elements or an empty needle the input list itself
    is returned. Nothing is trimmed when the needle is absent or already sits
    at offset 0.
    """
    if not elements or not needle:
        return elements

    head = copy.deepcopy(elements[0])
    head_text = element_text(head)
    located = find_needle_span(head_text, needle)
    needs_trim = located is not None and located[0] > 0
    if needs_trim:
        head["text"] = head_text[located[0] :].strip()
    return [head] + list(elements[1:])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def trim_last_element_to_anchor_end(elements: list[dict[str, Any]], anchor_end: str) -> list[dict[str, Any]]:
    """
    Cut the last element's text right after the end of ``anchor_end``.

    The last element is deep-copied before its ``text`` is rewritten; the
    preceding elements are reused as-is in a new list. With no elements or an
    empty anchor the input list itself is returned. The text is left alone
    when the anchor is not found in it.
    """
    if not elements or not anchor_end:
        return elements

    tail = copy.deepcopy(elements[-1])
    tail_text = element_text(tail)
    located = find_needle_span(tail_text, anchor_end)
    if located is not None:
        tail["text"] = tail_text[: located[1]].strip()
    return list(elements[:-1]) + [tail]
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Anchor mapping status and detail strings from slice metadata."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def anchor_mapping_status_from_meta(meta: dict[str, Any]) -> str:
    """
    Classify one node's slice as ``"pass"`` or ``"failed"``.

    In every mode the start must have been found by a needle, not by the page
    fallback. The end requirement depends on ``mapping_anchor_mode``:

    - ``start_only``: the end must not come from the page fallback
    - ``end_only``: the end must be found and not come from the page fallback
    - anything else (``both``): the end must be found via an
      ``anchor_end*`` strategy
    """
    mode = str(meta.get("mapping_anchor_mode") or "both")
    start_strategy = str(meta.get("mapping_start_strategy") or "")
    end_strategy = str(meta.get("mapping_end_strategy") or "")
    end_found = bool(meta.get("mapping_anchor_end_found"))

    start_ok = bool(meta.get("mapping_anchor_start_found")) and start_strategy != "page_start"
    end_not_page = end_strategy != "page_end"

    if mode == "start_only":
        passed = start_ok and end_not_page
    elif mode == "end_only":
        passed = start_ok and end_found and end_not_page
    else:
        passed = start_ok and end_found and end_strategy.startswith("anchor_end")

    return "pass" if passed else "failed"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def anchor_mapping_detail_from_meta(meta: dict[str, Any]) -> str:
    """
    Summarise in one line which strategies located the slice boundaries.

    The result has three ``"; "``-separated parts: the mode, how the start
    was located, and how the end was located. Neighbour node ids are appended
    when the boundary came from a neighbouring node and the id is known.
    """
    mode = str(meta.get("mapping_anchor_mode") or "both")
    start_found = bool(meta.get("mapping_anchor_start_found"))
    start_strategy = str(meta.get("mapping_start_strategy") or "")
    end_found = bool(meta.get("mapping_anchor_end_found"))
    end_strategy = str(meta.get("mapping_end_strategy") or "") or "unknown"
    next_nid = meta.get("mapping_next_node_id")
    prev_nid = meta.get("mapping_prev_node_id")

    if start_strategy == "prev_end":
        start_part = "start: prev node end" + (f" (node {prev_nid})" if prev_nid else "")
    elif start_found:
        start_part = "start: anchor"
    else:
        start_part = "start: page fallback"

    if end_strategy == "next_start":
        end_part = "end: next node start" + (f" (node {next_nid})" if next_nid else "")
    elif end_found and end_strategy.startswith("anchor_end"):
        partial = end_strategy == "anchor_end_partial"
        end_part = "end: anchor (trimmed at mid-element)" if partial else "end: anchor"
    else:
        # Fallback strategies get a fixed phrase; anything else is echoed.
        fallback_phrases = {
            "page_end": "end: page fallback",
            "span_end": "end: span end",
        }
        end_part = fallback_phrases.get(end_strategy, f"end: {end_strategy}")

    return "; ".join([f"mode: {mode}", start_part, end_part])
|