haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag-slim might be problematic; see the registry's advisory page for more details.
- haiku/rag/app.py +430 -72
- haiku/rag/chunkers/__init__.py +31 -0
- haiku/rag/chunkers/base.py +31 -0
- haiku/rag/chunkers/docling_local.py +164 -0
- haiku/rag/chunkers/docling_serve.py +179 -0
- haiku/rag/cli.py +207 -24
- haiku/rag/cli_chat.py +489 -0
- haiku/rag/client.py +1251 -266
- haiku/rag/config/__init__.py +16 -10
- haiku/rag/config/loader.py +5 -44
- haiku/rag/config/models.py +126 -17
- haiku/rag/converters/__init__.py +31 -0
- haiku/rag/converters/base.py +63 -0
- haiku/rag/converters/docling_local.py +193 -0
- haiku/rag/converters/docling_serve.py +229 -0
- haiku/rag/converters/text_utils.py +237 -0
- haiku/rag/embeddings/__init__.py +123 -24
- haiku/rag/embeddings/voyageai.py +175 -20
- haiku/rag/graph/__init__.py +0 -11
- haiku/rag/graph/agui/__init__.py +8 -2
- haiku/rag/graph/agui/cli_renderer.py +1 -1
- haiku/rag/graph/agui/emitter.py +219 -31
- haiku/rag/graph/agui/server.py +20 -62
- haiku/rag/graph/agui/stream.py +1 -2
- haiku/rag/graph/research/__init__.py +5 -2
- haiku/rag/graph/research/dependencies.py +12 -126
- haiku/rag/graph/research/graph.py +390 -135
- haiku/rag/graph/research/models.py +91 -112
- haiku/rag/graph/research/prompts.py +99 -91
- haiku/rag/graph/research/state.py +35 -27
- haiku/rag/inspector/__init__.py +8 -0
- haiku/rag/inspector/app.py +259 -0
- haiku/rag/inspector/widgets/__init__.py +6 -0
- haiku/rag/inspector/widgets/chunk_list.py +100 -0
- haiku/rag/inspector/widgets/context_modal.py +89 -0
- haiku/rag/inspector/widgets/detail_view.py +130 -0
- haiku/rag/inspector/widgets/document_list.py +75 -0
- haiku/rag/inspector/widgets/info_modal.py +209 -0
- haiku/rag/inspector/widgets/search_modal.py +183 -0
- haiku/rag/inspector/widgets/visual_modal.py +126 -0
- haiku/rag/mcp.py +106 -102
- haiku/rag/monitor.py +33 -9
- haiku/rag/providers/__init__.py +5 -0
- haiku/rag/providers/docling_serve.py +108 -0
- haiku/rag/qa/__init__.py +12 -10
- haiku/rag/qa/agent.py +43 -61
- haiku/rag/qa/prompts.py +35 -57
- haiku/rag/reranking/__init__.py +9 -6
- haiku/rag/reranking/base.py +1 -1
- haiku/rag/reranking/cohere.py +5 -4
- haiku/rag/reranking/mxbai.py +5 -2
- haiku/rag/reranking/vllm.py +3 -4
- haiku/rag/reranking/zeroentropy.py +6 -5
- haiku/rag/store/__init__.py +2 -1
- haiku/rag/store/engine.py +242 -42
- haiku/rag/store/exceptions.py +4 -0
- haiku/rag/store/models/__init__.py +8 -2
- haiku/rag/store/models/chunk.py +190 -0
- haiku/rag/store/models/document.py +46 -0
- haiku/rag/store/repositories/chunk.py +141 -121
- haiku/rag/store/repositories/document.py +25 -84
- haiku/rag/store/repositories/settings.py +11 -14
- haiku/rag/store/upgrades/__init__.py +19 -3
- haiku/rag/store/upgrades/v0_10_1.py +1 -1
- haiku/rag/store/upgrades/v0_19_6.py +65 -0
- haiku/rag/store/upgrades/v0_20_0.py +68 -0
- haiku/rag/store/upgrades/v0_23_1.py +100 -0
- haiku/rag/store/upgrades/v0_9_3.py +3 -3
- haiku/rag/utils.py +371 -146
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
- haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
- haiku/rag/chunker.py +0 -65
- haiku/rag/embeddings/base.py +0 -25
- haiku/rag/embeddings/ollama.py +0 -28
- haiku/rag/embeddings/openai.py +0 -26
- haiku/rag/embeddings/vllm.py +0 -29
- haiku/rag/graph/agui/events.py +0 -254
- haiku/rag/graph/common/__init__.py +0 -5
- haiku/rag/graph/common/models.py +0 -42
- haiku/rag/graph/common/nodes.py +0 -265
- haiku/rag/graph/common/prompts.py +0 -46
- haiku/rag/graph/common/utils.py +0 -44
- haiku/rag/graph/deep_qa/__init__.py +0 -1
- haiku/rag/graph/deep_qa/dependencies.py +0 -27
- haiku/rag/graph/deep_qa/graph.py +0 -243
- haiku/rag/graph/deep_qa/models.py +0 -20
- haiku/rag/graph/deep_qa/prompts.py +0 -59
- haiku/rag/graph/deep_qa/state.py +0 -56
- haiku/rag/graph/research/common.py +0 -87
- haiku/rag/reader.py +0 -135
- haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Document chunker abstraction for haiku.rag."""
|
|
2
|
+
|
|
3
|
+
from haiku.rag.chunkers.base import DocumentChunker
|
|
4
|
+
from haiku.rag.config import AppConfig, Config
|
|
5
|
+
|
|
6
|
+
__all__ = ["DocumentChunker", "get_chunker"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_chunker(config: AppConfig = Config) -> DocumentChunker:
|
|
10
|
+
"""Get a document chunker instance based on configuration.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
config: Configuration to use. Defaults to global Config.
|
|
14
|
+
|
|
15
|
+
Returns:
|
|
16
|
+
DocumentChunker instance configured according to the config.
|
|
17
|
+
|
|
18
|
+
Raises:
|
|
19
|
+
ValueError: If the chunker provider is not recognized.
|
|
20
|
+
"""
|
|
21
|
+
if config.processing.chunker == "docling-local":
|
|
22
|
+
from haiku.rag.chunkers.docling_local import DoclingLocalChunker
|
|
23
|
+
|
|
24
|
+
return DoclingLocalChunker(config)
|
|
25
|
+
|
|
26
|
+
if config.processing.chunker == "docling-serve":
|
|
27
|
+
from haiku.rag.chunkers.docling_serve import DoclingServeChunker
|
|
28
|
+
|
|
29
|
+
return DoclingServeChunker(config)
|
|
30
|
+
|
|
31
|
+
raise ValueError(f"Unsupported chunker: {config.processing.chunker}")
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from docling_core.types.doc.document import DoclingDocument

    from haiku.rag.store.models.chunk import Chunk


class DocumentChunker(ABC):
    """Interface for document chunkers.

    Implementations split a DoclingDocument into smaller text chunks suitable
    for embedding and retrieval, respecting the document's structure and
    semantic boundaries.
    """

    @abstractmethod
    async def chunk(self, document: "DoclingDocument") -> list["Chunk"]:
        """Split a document into chunks carrying structured metadata.

        Args:
            document: The DoclingDocument to chunk.

        Returns:
            List of Chunk whose metadata dict holds the structured fields
            (doc_item_refs, headings, labels, page_numbers).

        Raises:
            ValueError: If chunking fails.
        """
        ...
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, cast
|
|
2
|
+
|
|
3
|
+
from haiku.rag.chunkers.base import DocumentChunker
|
|
4
|
+
from haiku.rag.config import AppConfig, Config
|
|
5
|
+
from haiku.rag.store.models.chunk import Chunk, ChunkMetadata
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from docling_core.transforms.chunker.doc_chunk import DocMeta
|
|
9
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _create_markdown_serializer_provider(use_markdown_tables: bool = True):
    """Build a chunking serializer provider with configurable table rendering.

    Implemented as a factory so docling-core is only imported when a provider
    is actually needed, never at module import time.

    Args:
        use_markdown_tables: When True, tables are rendered as markdown via
            MarkdownTableSerializer. When False, the default
            ChunkingDocSerializer (TripletTableSerializer) narrative format
            is used.
    """
    from docling_core.transforms.chunker.hierarchical_chunker import (
        ChunkingDocSerializer,
        ChunkingSerializerProvider,
    )
    from docling_core.transforms.serializer.markdown import MarkdownTableSerializer

    class _TableAwareProvider(ChunkingSerializerProvider):
        """Provider that selects the table serializer per configuration."""

        def __init__(self, use_markdown_tables: bool = True):
            self.use_markdown_tables = use_markdown_tables

        def get_serializer(self, doc):
            if not self.use_markdown_tables:
                # Default ChunkingDocSerializer falls back to
                # TripletTableSerializer (narrative format).
                return ChunkingDocSerializer(doc=doc)
            return ChunkingDocSerializer(
                doc=doc,
                table_serializer=MarkdownTableSerializer(),
            )

    return _TableAwareProvider(use_markdown_tables=use_markdown_tables)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DoclingLocalChunker(DocumentChunker):
    """Local document chunker using docling's chunkers.

    Supports both hybrid (structure-aware) and hierarchical chunking strategies.
    Chunking is performed locally using the HuggingFace tokenizer specified in
    configuration.

    Args:
        config: Application configuration.

    Raises:
        ValueError: If ``config.processing.chunker_type`` is neither
            "hybrid" nor "hierarchical".
    """

    def __init__(self, config: AppConfig = Config):
        # Imported lazily so heavy dependencies (docling-core, transformers)
        # are only loaded when this chunker is actually instantiated.
        from docling_core.transforms.chunker.hierarchical_chunker import (
            HierarchicalChunker,
        )
        from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
        from docling_core.transforms.chunker.tokenizer.huggingface import (
            HuggingFaceTokenizer,
        )
        from transformers import AutoTokenizer

        self.config = config
        self.chunk_size = config.processing.chunk_size
        self.chunker_type = config.processing.chunker_type
        self.tokenizer_name = config.processing.chunking_tokenizer

        # Validate up front so misconfiguration fails fast, before any
        # tokenizer/serializer construction.
        if self.chunker_type not in ("hybrid", "hierarchical"):
            raise ValueError(
                f"Unsupported chunker_type: {self.chunker_type}. "
                "Must be 'hybrid' or 'hierarchical'."
            )

        # Both strategies share the same serializer-provider configuration;
        # build it once instead of duplicating per branch.
        serializer_provider = _create_markdown_serializer_provider(
            use_markdown_tables=config.processing.chunking_use_markdown_tables
        )

        if self.chunker_type == "hybrid":
            hf_tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
            tokenizer = HuggingFaceTokenizer(
                tokenizer=hf_tokenizer, max_tokens=self.chunk_size
            )
            self.chunker = HybridChunker(
                tokenizer=tokenizer,
                merge_peers=config.processing.chunking_merge_peers,
                serializer_provider=serializer_provider,
            )
        else:
            self.chunker = HierarchicalChunker(
                serializer_provider=serializer_provider
            )

    async def chunk(self, document: "DoclingDocument") -> list[Chunk]:
        """Split the document into chunks with metadata.

        Extracts structured metadata from each DocChunk including:
        - doc_item_refs: JSON pointer references to DocItems (e.g., "#/texts/5")
        - headings: Section heading hierarchy
        - labels: Semantic labels for each doc_item (e.g., "paragraph", "table")
        - page_numbers: De-duplicated, sorted page numbers where content appears

        Args:
            document: The DoclingDocument to be split into chunks.

        Returns:
            List of Chunk containing content and structured metadata.
        """
        if document is None:
            return []

        result: list[Chunk] = []

        # Iterate the chunker's output directly; no need to materialize it.
        for order, doc_chunk in enumerate(self.chunker.chunk(document)):
            doc_item_refs: list[str] = []
            labels: list[str] = []
            pages: set[int] = set()  # set gives O(1) de-duplication
            headings: list[str] | None = None

            # chunk.meta is typed loosely upstream; narrow for type safety.
            meta = cast("DocMeta | None", doc_chunk.meta)
            if meta and meta.doc_items:
                for doc_item in meta.doc_items:
                    # JSON pointer reference (e.g. "#/texts/5")
                    if doc_item.self_ref:
                        doc_item_refs.append(doc_item.self_ref)
                    # Semantic label (e.g. "paragraph", "table")
                    if doc_item.label:
                        labels.append(doc_item.label)
                    # Page numbers from provenance entries
                    for prov in doc_item.prov or []:
                        if prov.page_no is not None:
                            pages.add(prov.page_no)

            if meta and meta.headings:
                headings = list(meta.headings)

            chunk_metadata = ChunkMetadata(
                doc_item_refs=doc_item_refs,
                headings=headings,
                labels=labels,
                page_numbers=sorted(pages),
            )
            result.append(
                Chunk(
                    content=doc_chunk.text,
                    metadata=chunk_metadata.model_dump(),
                    order=order,
                )
            )

        return result
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from haiku.rag.chunkers.base import DocumentChunker
|
|
6
|
+
from haiku.rag.config import AppConfig, Config
|
|
7
|
+
from haiku.rag.providers.docling_serve import DoclingServeClient
|
|
8
|
+
from haiku.rag.store.models.chunk import Chunk, ChunkMetadata
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
12
|
+
|
|
13
|
+
# Pattern to parse refs like "#/texts/5" or "#/tables/0"
|
|
14
|
+
REF_PATTERN = re.compile(r"^#/(\w+)/(\d+)$")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _resolve_label_from_document(ref: str, document: "DoclingDocument") -> str | None:
|
|
18
|
+
"""Resolve the label for a doc_item ref by looking it up in the document.
|
|
19
|
+
|
|
20
|
+
The docling-serve API only returns ref strings in doc_items, not labels.
|
|
21
|
+
This function resolves actual labels from the DoclingDocument.
|
|
22
|
+
See: https://github.com/docling-project/docling-serve/issues/448
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
ref: JSON pointer reference like "#/texts/5" or "#/tables/0"
|
|
26
|
+
document: The DoclingDocument to look up the item in
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
The label string if found, None otherwise
|
|
30
|
+
"""
|
|
31
|
+
match = REF_PATTERN.match(ref)
|
|
32
|
+
if not match:
|
|
33
|
+
return None
|
|
34
|
+
|
|
35
|
+
collection_name = match.group(1)
|
|
36
|
+
index = int(match.group(2))
|
|
37
|
+
|
|
38
|
+
collection = getattr(document, collection_name, None)
|
|
39
|
+
if collection is None or index >= len(collection):
|
|
40
|
+
return None
|
|
41
|
+
|
|
42
|
+
item = collection[index]
|
|
43
|
+
return getattr(item, "label", None)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DoclingServeChunker(DocumentChunker):
    """Remote document chunker using the docling-serve API.

    Sends DoclingDocument JSON to docling-serve for chunking. Supports both
    hybrid and hierarchical chunking strategies via the remote API.

    Args:
        config: Application configuration containing docling-serve settings.
    """

    def __init__(self, config: AppConfig = Config):
        self.config = config
        self.client = DoclingServeClient(
            base_url=config.providers.docling_serve.base_url,
            api_key=config.providers.docling_serve.api_key,
        )
        self.chunker_type = config.processing.chunker_type

    def _build_chunking_data(self) -> dict[str, str]:
        """Build form data for the chunking request.

        Values are stringified for the multipart form; booleans are lowercased
        ("true"/"false") as the API expects.
        """
        processing = self.config.processing
        return {
            "chunking_max_tokens": str(processing.chunk_size),
            "chunking_tokenizer": processing.chunking_tokenizer,
            "chunking_merge_peers": str(processing.chunking_merge_peers).lower(),
            "chunking_use_markdown_tables": str(
                processing.chunking_use_markdown_tables
            ).lower(),
        }

    async def _call_chunk_api(self, document: "DoclingDocument") -> list[dict]:
        """Call the docling-serve chunking API and return raw chunk data.

        Args:
            document: The DoclingDocument to be split into chunks.

        Returns:
            List of chunk dictionaries from the API response.

        Raises:
            ValueError: If chunking fails or the service is unavailable.
        """
        # Endpoint depends on the configured chunking strategy.
        if self.chunker_type == "hierarchical":
            endpoint = "/v1/chunk/hierarchical/file/async"
        else:
            endpoint = "/v1/chunk/hybrid/file/async"

        # Ship the DoclingDocument as a JSON file part of a multipart request.
        doc_bytes = document.model_dump_json().encode("utf-8")
        files = {"files": ("document.json", BytesIO(doc_bytes), "application/json")}

        result = await self.client.submit_and_poll(
            endpoint=endpoint,
            files=files,
            data=self._build_chunking_data(),
            name="document",
        )

        return result.get("chunks", [])

    async def chunk(self, document: "DoclingDocument") -> list[Chunk]:
        """Split the document into chunks with metadata via docling-serve.

        Extracts structured metadata from the API response including:
        - doc_item_refs: JSON pointer references to DocItems (e.g., "#/texts/5")
        - headings: Section heading hierarchy
        - labels: Semantic labels for each doc_item
        - page_numbers: De-duplicated, sorted page numbers where content appears

        Args:
            document: The DoclingDocument to be split into chunks.

        Returns:
            List of Chunk containing content and structured metadata.

        Raises:
            ValueError: If chunking fails or the service is unavailable.
        """
        if document is None:
            return []

        raw_chunks = await self._call_chunk_api(document)
        result: list[Chunk] = []

        for order, raw_chunk in enumerate(raw_chunks):
            doc_item_refs: list[str] = []
            labels: list[str] = []

            # doc_items from docling-serve is a list of ref strings like
            # ["#/texts/1", "#/tables/0"]; labels must be resolved against
            # the document itself (see docling-serve issue #448).
            for item in raw_chunk.get("doc_items", []):
                if isinstance(item, str):
                    doc_item_refs.append(item)
                    label = _resolve_label_from_document(item, document)
                    if label:
                        labels.append(label)
                elif isinstance(item, dict):
                    # Defensive: handle a dict format if the API ever returns it.
                    if "self_ref" in item:
                        doc_item_refs.append(item["self_ref"])
                    if "label" in item:
                        labels.append(item["label"])

            # Headings and page numbers come straight from the chunk payload.
            headings = raw_chunk.get("headings")
            page_numbers = raw_chunk.get("page_numbers", [])

            chunk_metadata = ChunkMetadata(
                doc_item_refs=doc_item_refs,
                headings=headings,
                labels=labels,
                # De-duplicate and sort for consistency with DoclingLocalChunker.
                page_numbers=sorted(set(page_numbers)) if page_numbers else [],
            )
            result.append(
                Chunk(
                    content=raw_chunk.get("text", ""),
                    metadata=chunk_metadata.model_dump(),
                    order=order,
                )
            )

        return result
|