haiku.rag-slim 0.16.0-py3-none-any.whl → 0.24.0-py3-none-any.whl

This diff reflects the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.

Files changed (94)
  1. haiku/rag/app.py +430 -72
  2. haiku/rag/chunkers/__init__.py +31 -0
  3. haiku/rag/chunkers/base.py +31 -0
  4. haiku/rag/chunkers/docling_local.py +164 -0
  5. haiku/rag/chunkers/docling_serve.py +179 -0
  6. haiku/rag/cli.py +207 -24
  7. haiku/rag/cli_chat.py +489 -0
  8. haiku/rag/client.py +1251 -266
  9. haiku/rag/config/__init__.py +16 -10
  10. haiku/rag/config/loader.py +5 -44
  11. haiku/rag/config/models.py +126 -17
  12. haiku/rag/converters/__init__.py +31 -0
  13. haiku/rag/converters/base.py +63 -0
  14. haiku/rag/converters/docling_local.py +193 -0
  15. haiku/rag/converters/docling_serve.py +229 -0
  16. haiku/rag/converters/text_utils.py +237 -0
  17. haiku/rag/embeddings/__init__.py +123 -24
  18. haiku/rag/embeddings/voyageai.py +175 -20
  19. haiku/rag/graph/__init__.py +0 -11
  20. haiku/rag/graph/agui/__init__.py +8 -2
  21. haiku/rag/graph/agui/cli_renderer.py +1 -1
  22. haiku/rag/graph/agui/emitter.py +219 -31
  23. haiku/rag/graph/agui/server.py +20 -62
  24. haiku/rag/graph/agui/stream.py +1 -2
  25. haiku/rag/graph/research/__init__.py +5 -2
  26. haiku/rag/graph/research/dependencies.py +12 -126
  27. haiku/rag/graph/research/graph.py +390 -135
  28. haiku/rag/graph/research/models.py +91 -112
  29. haiku/rag/graph/research/prompts.py +99 -91
  30. haiku/rag/graph/research/state.py +35 -27
  31. haiku/rag/inspector/__init__.py +8 -0
  32. haiku/rag/inspector/app.py +259 -0
  33. haiku/rag/inspector/widgets/__init__.py +6 -0
  34. haiku/rag/inspector/widgets/chunk_list.py +100 -0
  35. haiku/rag/inspector/widgets/context_modal.py +89 -0
  36. haiku/rag/inspector/widgets/detail_view.py +130 -0
  37. haiku/rag/inspector/widgets/document_list.py +75 -0
  38. haiku/rag/inspector/widgets/info_modal.py +209 -0
  39. haiku/rag/inspector/widgets/search_modal.py +183 -0
  40. haiku/rag/inspector/widgets/visual_modal.py +126 -0
  41. haiku/rag/mcp.py +106 -102
  42. haiku/rag/monitor.py +33 -9
  43. haiku/rag/providers/__init__.py +5 -0
  44. haiku/rag/providers/docling_serve.py +108 -0
  45. haiku/rag/qa/__init__.py +12 -10
  46. haiku/rag/qa/agent.py +43 -61
  47. haiku/rag/qa/prompts.py +35 -57
  48. haiku/rag/reranking/__init__.py +9 -6
  49. haiku/rag/reranking/base.py +1 -1
  50. haiku/rag/reranking/cohere.py +5 -4
  51. haiku/rag/reranking/mxbai.py +5 -2
  52. haiku/rag/reranking/vllm.py +3 -4
  53. haiku/rag/reranking/zeroentropy.py +6 -5
  54. haiku/rag/store/__init__.py +2 -1
  55. haiku/rag/store/engine.py +242 -42
  56. haiku/rag/store/exceptions.py +4 -0
  57. haiku/rag/store/models/__init__.py +8 -2
  58. haiku/rag/store/models/chunk.py +190 -0
  59. haiku/rag/store/models/document.py +46 -0
  60. haiku/rag/store/repositories/chunk.py +141 -121
  61. haiku/rag/store/repositories/document.py +25 -84
  62. haiku/rag/store/repositories/settings.py +11 -14
  63. haiku/rag/store/upgrades/__init__.py +19 -3
  64. haiku/rag/store/upgrades/v0_10_1.py +1 -1
  65. haiku/rag/store/upgrades/v0_19_6.py +65 -0
  66. haiku/rag/store/upgrades/v0_20_0.py +68 -0
  67. haiku/rag/store/upgrades/v0_23_1.py +100 -0
  68. haiku/rag/store/upgrades/v0_9_3.py +3 -3
  69. haiku/rag/utils.py +371 -146
  70. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
  71. haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
  72. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
  73. haiku/rag/chunker.py +0 -65
  74. haiku/rag/embeddings/base.py +0 -25
  75. haiku/rag/embeddings/ollama.py +0 -28
  76. haiku/rag/embeddings/openai.py +0 -26
  77. haiku/rag/embeddings/vllm.py +0 -29
  78. haiku/rag/graph/agui/events.py +0 -254
  79. haiku/rag/graph/common/__init__.py +0 -5
  80. haiku/rag/graph/common/models.py +0 -42
  81. haiku/rag/graph/common/nodes.py +0 -265
  82. haiku/rag/graph/common/prompts.py +0 -46
  83. haiku/rag/graph/common/utils.py +0 -44
  84. haiku/rag/graph/deep_qa/__init__.py +0 -1
  85. haiku/rag/graph/deep_qa/dependencies.py +0 -27
  86. haiku/rag/graph/deep_qa/graph.py +0 -243
  87. haiku/rag/graph/deep_qa/models.py +0 -20
  88. haiku/rag/graph/deep_qa/prompts.py +0 -59
  89. haiku/rag/graph/deep_qa/state.py +0 -56
  90. haiku/rag/graph/research/common.py +0 -87
  91. haiku/rag/reader.py +0 -135
  92. haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
  93. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
  94. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/chunkers/__init__.py
@@ -0,0 +1,31 @@
+"""Document chunker abstraction for haiku.rag."""
+
+from haiku.rag.chunkers.base import DocumentChunker
+from haiku.rag.config import AppConfig, Config
+
+__all__ = ["DocumentChunker", "get_chunker"]
+
+
+def get_chunker(config: AppConfig = Config) -> DocumentChunker:
+    """Get a document chunker instance based on configuration.
+
+    Args:
+        config: Configuration to use. Defaults to global Config.
+
+    Returns:
+        DocumentChunker instance configured according to the config.
+
+    Raises:
+        ValueError: If the chunker provider is not recognized.
+    """
+    if config.processing.chunker == "docling-local":
+        from haiku.rag.chunkers.docling_local import DoclingLocalChunker
+
+        return DoclingLocalChunker(config)
+
+    if config.processing.chunker == "docling-serve":
+        from haiku.rag.chunkers.docling_serve import DoclingServeChunker
+
+        return DoclingServeChunker(config)
+
+    raise ValueError(f"Unsupported chunker: {config.processing.chunker}")
haiku/rag/chunkers/base.py
@@ -0,0 +1,31 @@
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DoclingDocument
+
+    from haiku.rag.store.models.chunk import Chunk
+
+
+class DocumentChunker(ABC):
+    """Abstract base class for document chunkers.
+
+    Document chunkers split DoclingDocuments into smaller text chunks suitable
+    for embedding and retrieval, respecting document structure and semantic boundaries.
+    """
+
+    @abstractmethod
+    async def chunk(self, document: "DoclingDocument") -> list["Chunk"]:
+        """Split a document into chunks with metadata.
+
+        Args:
+            document: The DoclingDocument to chunk.
+
+        Returns:
+            List of Chunk with content and structured metadata in the metadata dict
+            (doc_item_refs, headings, labels, page_numbers).
+
+        Raises:
+            ValueError: If chunking fails.
+        """
+        pass
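
To make the contract concrete, here is a sketch of a custom implementation (not part of this release). It assumes DoclingDocument.export_to_markdown() from docling-core and builds Chunk/ChunkMetadata objects the same way the bundled chunkers in this diff do; the naive fixed-size split is purely illustrative:

from docling_core.types.doc.document import DoclingDocument

from haiku.rag.chunkers.base import DocumentChunker
from haiku.rag.store.models.chunk import Chunk, ChunkMetadata


class NaiveChunker(DocumentChunker):
    """Toy chunker: fixed-size character windows over the markdown export."""

    def __init__(self, window: int = 2000):
        self.window = window

    async def chunk(self, document: "DoclingDocument") -> list[Chunk]:
        text = document.export_to_markdown()  # assumed docling-core API
        pieces = [text[i : i + self.window] for i in range(0, len(text), self.window)]
        # No structural metadata is available with this naive split
        metadata = ChunkMetadata(
            doc_item_refs=[], headings=None, labels=[], page_numbers=[]
        )
        return [
            Chunk(content=piece, metadata=metadata.model_dump(), order=i)
            for i, piece in enumerate(pieces)
        ]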
haiku/rag/chunkers/docling_local.py
@@ -0,0 +1,164 @@
+from typing import TYPE_CHECKING, cast
+
+from haiku.rag.chunkers.base import DocumentChunker
+from haiku.rag.config import AppConfig, Config
+from haiku.rag.store.models.chunk import Chunk, ChunkMetadata
+
+if TYPE_CHECKING:
+    from docling_core.transforms.chunker.doc_chunk import DocMeta
+    from docling_core.types.doc.document import DoclingDocument
+
+
+def _create_markdown_serializer_provider(use_markdown_tables: bool = True):
+    """Create a markdown serializer provider with configurable table rendering.
+
+    This function creates a custom serializer provider that extends ChunkingSerializerProvider
+    from docling-core. It's implemented as a factory function to avoid importing
+    docling-core at module level.
+
+    Args:
+        use_markdown_tables: If True, use MarkdownTableSerializer for rendering tables as
+            markdown. If False, use default TripletTableSerializer for narrative format.
+    """
+    from docling_core.transforms.chunker.hierarchical_chunker import (
+        ChunkingDocSerializer,
+        ChunkingSerializerProvider,
+    )
+    from docling_core.transforms.serializer.markdown import MarkdownTableSerializer
+
+    class MDTableSerializerProvider(ChunkingSerializerProvider):
+        """Serializer provider for markdown table output."""
+
+        def __init__(self, use_markdown_tables: bool = True):
+            self.use_markdown_tables = use_markdown_tables
+
+        def get_serializer(self, doc):
+            if self.use_markdown_tables:
+                return ChunkingDocSerializer(
+                    doc=doc,
+                    table_serializer=MarkdownTableSerializer(),
+                )
+            else:
+                # Use default ChunkingDocSerializer (TripletTableSerializer)
+                return ChunkingDocSerializer(doc=doc)
+
+    return MDTableSerializerProvider(use_markdown_tables=use_markdown_tables)
+
+
+class DoclingLocalChunker(DocumentChunker):
+    """Local document chunker using docling's chunkers.
+
+    Supports both hybrid (structure-aware) and hierarchical chunking strategies.
+    Chunking is performed locally using the HuggingFace tokenizer specified in
+    configuration.
+
+    Args:
+        config: Application configuration.
+    """
+
+    def __init__(self, config: AppConfig = Config):
+        from docling_core.transforms.chunker.hierarchical_chunker import (
+            HierarchicalChunker,
+        )
+        from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+        from docling_core.transforms.chunker.tokenizer.huggingface import (
+            HuggingFaceTokenizer,
+        )
+        from transformers import AutoTokenizer
+
+        self.config = config
+        self.chunk_size = config.processing.chunk_size
+        self.chunker_type = config.processing.chunker_type
+        self.tokenizer_name = config.processing.chunking_tokenizer
+
+        if self.chunker_type == "hybrid":
+            hf_tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
+            tokenizer = HuggingFaceTokenizer(
+                tokenizer=hf_tokenizer, max_tokens=self.chunk_size
+            )
+            serializer_provider = _create_markdown_serializer_provider(
+                use_markdown_tables=config.processing.chunking_use_markdown_tables
+            )
+            self.chunker = HybridChunker(
+                tokenizer=tokenizer,
+                merge_peers=config.processing.chunking_merge_peers,
+                serializer_provider=serializer_provider,
+            )
+        elif self.chunker_type == "hierarchical":
+            serializer_provider = _create_markdown_serializer_provider(
+                use_markdown_tables=config.processing.chunking_use_markdown_tables
+            )
+            self.chunker = HierarchicalChunker(serializer_provider=serializer_provider)
+        else:
+            raise ValueError(
+                f"Unsupported chunker_type: {self.chunker_type}. "
+                "Must be 'hybrid' or 'hierarchical'."
+            )
+
+    async def chunk(self, document: "DoclingDocument") -> list[Chunk]:
+        """Split the document into chunks with metadata.
+
+        Extracts structured metadata from each DocChunk including:
+        - doc_item_refs: JSON pointer references to DocItems (e.g., "#/texts/5")
+        - headings: Section heading hierarchy
+        - labels: Semantic labels for each doc_item (e.g., "paragraph", "table")
+        - page_numbers: Page numbers where content appears
+
+        Args:
+            document: The DoclingDocument to be split into chunks.
+
+        Returns:
+            List of Chunk containing content and structured metadata.
+        """
+        if document is None:
+            return []
+
+        raw_chunks = list(self.chunker.chunk(document))
+        result: list[Chunk] = []
+
+        for chunk in raw_chunks:
+            text = chunk.text
+
+            # Extract metadata from DocChunk.meta (cast to DocMeta for type safety)
+            doc_item_refs: list[str] = []
+            labels: list[str] = []
+            page_numbers: list[int] = []
+            headings: list[str] | None = None
+
+            meta = cast("DocMeta | None", chunk.meta)
+            if meta and meta.doc_items:
+                for doc_item in meta.doc_items:
+                    # Get JSON pointer reference
+                    if doc_item.self_ref:
+                        doc_item_refs.append(doc_item.self_ref)
+                    # Get label
+                    if doc_item.label:
+                        labels.append(doc_item.label)
+                    # Get page numbers from provenance
+                    if doc_item.prov:
+                        for prov in doc_item.prov:
+                            if (
+                                prov.page_no is not None
+                                and prov.page_no not in page_numbers
+                            ):
+                                page_numbers.append(prov.page_no)
+
+            # Get headings from chunk metadata
+            if meta and meta.headings:
+                headings = list(meta.headings)
+
+            chunk_metadata = ChunkMetadata(
+                doc_item_refs=doc_item_refs,
+                headings=headings,
+                labels=labels,
+                page_numbers=sorted(page_numbers),
+            )
+            result.append(
+                Chunk(
+                    content=text,
+                    metadata=chunk_metadata.model_dump(),
+                    order=len(result),
+                )
+            )
+
+        return result
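
For reference, the shape of chunk.metadata that the loop above produces once ChunkMetadata.model_dump() is applied; the field names come from this diff, while the concrete values are illustrative only:

# Illustrative values; only the field names are taken from ChunkMetadata in this diff.
example_metadata = {
    "doc_item_refs": ["#/texts/5", "#/tables/0"],  # JSON pointer refs into the DoclingDocument
    "headings": ["Introduction"],                  # section heading hierarchy, or None
    "labels": ["paragraph", "table"],              # one semantic label per doc_item
    "page_numbers": [3, 4],                        # sorted, de-duplicated page numbers
}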
haiku/rag/chunkers/docling_serve.py
@@ -0,0 +1,179 @@
+import re
+from io import BytesIO
+from typing import TYPE_CHECKING
+
+from haiku.rag.chunkers.base import DocumentChunker
+from haiku.rag.config import AppConfig, Config
+from haiku.rag.providers.docling_serve import DoclingServeClient
+from haiku.rag.store.models.chunk import Chunk, ChunkMetadata
+
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DoclingDocument
+
+# Pattern to parse refs like "#/texts/5" or "#/tables/0"
+REF_PATTERN = re.compile(r"^#/(\w+)/(\d+)$")
+
+
+def _resolve_label_from_document(ref: str, document: "DoclingDocument") -> str | None:
+    """Resolve the label for a doc_item ref by looking it up in the document.
+
+    The docling-serve API only returns ref strings in doc_items, not labels.
+    This function resolves actual labels from the DoclingDocument.
+    See: https://github.com/docling-project/docling-serve/issues/448
+
+    Args:
+        ref: JSON pointer reference like "#/texts/5" or "#/tables/0"
+        document: The DoclingDocument to look up the item in
+
+    Returns:
+        The label string if found, None otherwise
+    """
+    match = REF_PATTERN.match(ref)
+    if not match:
+        return None
+
+    collection_name = match.group(1)
+    index = int(match.group(2))
+
+    collection = getattr(document, collection_name, None)
+    if collection is None or index >= len(collection):
+        return None
+
+    item = collection[index]
+    return getattr(item, "label", None)
+
+
+class DoclingServeChunker(DocumentChunker):
+    """Remote document chunker using docling-serve API.
+
+    Sends DoclingDocument JSON to docling-serve for chunking. Supports both hybrid
+    and hierarchical chunking strategies via remote API.
+
+    Args:
+        config: Application configuration containing docling-serve settings.
+    """
+
+    def __init__(self, config: AppConfig = Config):
+        self.config = config
+        self.client = DoclingServeClient(
+            base_url=config.providers.docling_serve.base_url,
+            api_key=config.providers.docling_serve.api_key,
+        )
+        self.chunker_type = config.processing.chunker_type
+
+    def _build_chunking_data(self) -> dict[str, str]:
+        """Build form data for chunking request."""
+        return {
+            "chunking_max_tokens": str(self.config.processing.chunk_size),
+            "chunking_tokenizer": self.config.processing.chunking_tokenizer,
+            "chunking_merge_peers": str(
+                self.config.processing.chunking_merge_peers
+            ).lower(),
+            "chunking_use_markdown_tables": str(
+                self.config.processing.chunking_use_markdown_tables
+            ).lower(),
+        }
+
+    async def _call_chunk_api(self, document: "DoclingDocument") -> list[dict]:
+        """Call docling-serve chunking API and return raw chunk data.
+
+        Args:
+            document: The DoclingDocument to be split into chunks.
+
+        Returns:
+            List of chunk dictionaries from API response.
+
+        Raises:
+            ValueError: If chunking fails or service is unavailable.
+        """
+        # Determine endpoint based on chunker_type
+        if self.chunker_type == "hierarchical":
+            endpoint = "/v1/chunk/hierarchical/file/async"
+        else:
+            endpoint = "/v1/chunk/hybrid/file/async"
+
+        # Export document to JSON
+        doc_json = document.model_dump_json()
+        doc_bytes = doc_json.encode("utf-8")
+
+        # Prepare multipart request with DoclingDocument JSON
+        files = {"files": ("document.json", BytesIO(doc_bytes), "application/json")}
+        data = self._build_chunking_data()
+
+        result = await self.client.submit_and_poll(
+            endpoint=endpoint,
+            files=files,
+            data=data,
+            name="document",
+        )
+
+        return result.get("chunks", [])
+
+    async def chunk(self, document: "DoclingDocument") -> list[Chunk]:
+        """Split the document into chunks with metadata via docling-serve.
+
+        Extracts structured metadata from the API response including:
+        - doc_item_refs: JSON pointer references to DocItems (e.g., "#/texts/5")
+        - headings: Section heading hierarchy
+        - labels: Semantic labels for each doc_item
+        - page_numbers: Page numbers where content appears
+
+        Args:
+            document: The DoclingDocument to be split into chunks.
+
+        Returns:
+            List of Chunk containing content and structured metadata.
+
+        Raises:
+            ValueError: If chunking fails or service is unavailable.
+        """
+        if document is None:
+            return []
+
+        raw_chunks = await self._call_chunk_api(document)
+        result: list[Chunk] = []
+
+        for chunk in raw_chunks:
+            text = chunk.get("text", "")
+
+            # doc_items from docling-serve is a list of ref strings like ["#/texts/1", "#/tables/0"]
+            doc_items = chunk.get("doc_items", [])
+            doc_item_refs: list[str] = []
+            labels: list[str] = []
+
+            for item in doc_items:
+                if isinstance(item, str):
+                    # docling-serve returns refs as strings directly
+                    doc_item_refs.append(item)
+                    # Resolve label from the document using the ref
+                    label = _resolve_label_from_document(item, document)
+                    if label:
+                        labels.append(label)
+                elif isinstance(item, dict):
+                    # Handle dict format if API ever returns it
+                    if "self_ref" in item:
+                        doc_item_refs.append(item["self_ref"])
+                    if "label" in item:
+                        labels.append(item["label"])
+
+            # Get headings directly from chunk
+            headings = chunk.get("headings")
+
+            # Get page numbers directly from chunk
+            page_numbers = chunk.get("page_numbers", [])
+
+            chunk_metadata = ChunkMetadata(
+                doc_item_refs=doc_item_refs,
+                headings=headings,
+                labels=labels,
+                page_numbers=sorted(page_numbers) if page_numbers else [],
+            )
+            result.append(
+                Chunk(
+                    content=text,
+                    metadata=chunk_metadata.model_dump(),
+                    order=len(result),
+                )
+            )
+
+        return result
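
A quick sketch of the ref parsing that _resolve_label_from_document relies on, using a hypothetical ref value; the final lookup against a real DoclingDocument instance is shown as comments:

import re

ref_pattern = re.compile(r"^#/(\w+)/(\d+)$")  # same pattern as REF_PATTERN above

match = ref_pattern.match("#/texts/5")  # hypothetical ref string from docling-serve
assert match is not None
collection_name, index = match.group(1), int(match.group(2))  # -> "texts", 5

# _resolve_label_from_document then does the equivalent of:
#   collection = getattr(document, collection_name, None)
#   item = collection[index] if collection is not None and index < len(collection) else None
#   label = getattr(item, "label", None) if item is not None else None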