datahub-agent-context 1.3.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datahub_agent_context/__init__.py +25 -0
- datahub_agent_context/_version.py +16 -0
- datahub_agent_context/context.py +97 -0
- datahub_agent_context/langchain_tools/__init__.py +8 -0
- datahub_agent_context/langchain_tools/builder.py +127 -0
- datahub_agent_context/mcp_tools/__init__.py +46 -0
- datahub_agent_context/mcp_tools/_token_estimator.py +71 -0
- datahub_agent_context/mcp_tools/base.py +325 -0
- datahub_agent_context/mcp_tools/descriptions.py +299 -0
- datahub_agent_context/mcp_tools/documents.py +473 -0
- datahub_agent_context/mcp_tools/domains.py +246 -0
- datahub_agent_context/mcp_tools/entities.py +349 -0
- datahub_agent_context/mcp_tools/get_me.py +99 -0
- datahub_agent_context/mcp_tools/gql/__init__.py +13 -0
- datahub_agent_context/mcp_tools/gql/document_search.gql +114 -0
- datahub_agent_context/mcp_tools/gql/document_semantic_search.gql +111 -0
- datahub_agent_context/mcp_tools/gql/entity_details.gql +1682 -0
- datahub_agent_context/mcp_tools/gql/queries.gql +51 -0
- datahub_agent_context/mcp_tools/gql/query_entity.gql +37 -0
- datahub_agent_context/mcp_tools/gql/read_documents.gql +16 -0
- datahub_agent_context/mcp_tools/gql/search.gql +242 -0
- datahub_agent_context/mcp_tools/helpers.py +448 -0
- datahub_agent_context/mcp_tools/lineage.py +698 -0
- datahub_agent_context/mcp_tools/owners.py +318 -0
- datahub_agent_context/mcp_tools/queries.py +191 -0
- datahub_agent_context/mcp_tools/search.py +239 -0
- datahub_agent_context/mcp_tools/structured_properties.py +447 -0
- datahub_agent_context/mcp_tools/tags.py +296 -0
- datahub_agent_context/mcp_tools/terms.py +295 -0
- datahub_agent_context/py.typed +2 -0
- datahub_agent_context-1.3.1.8.dist-info/METADATA +233 -0
- datahub_agent_context-1.3.1.8.dist-info/RECORD +34 -0
- datahub_agent_context-1.3.1.8.dist-info/WHEEL +5 -0
- datahub_agent_context-1.3.1.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
"""Document tools for DataHub MCP server."""
|
|
2
|
+
|
|
3
|
+
import pathlib
|
|
4
|
+
from typing import Any, Dict, List, Literal, Optional
|
|
5
|
+
|
|
6
|
+
import re2 # type: ignore[import-untyped]
|
|
7
|
+
|
|
8
|
+
from datahub_agent_context.context import get_graph
|
|
9
|
+
from datahub_agent_context.mcp_tools.base import (
|
|
10
|
+
clean_gql_response,
|
|
11
|
+
execute_graphql,
|
|
12
|
+
fetch_global_default_view,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Load the GraphQL query documents once, at import time, from the package's
# bundled gql/ directory (sibling to this module).
_GQL_DIR = pathlib.Path(__file__).parent / "gql"

document_search_gql = (_GQL_DIR / "document_search.gql").read_text()
document_semantic_search_gql = (_GQL_DIR / "document_semantic_search.gql").read_text()
read_documents_gql = (_GQL_DIR / "read_documents.gql").read_text()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def search_documents(
    query: str = "*",
    platforms: Optional[List[str]] = None,
    domains: Optional[List[str]] = None,
    tags: Optional[List[str]] = None,
    glossary_terms: Optional[List[str]] = None,
    owners: Optional[List[str]] = None,
    num_results: int = 10,
    offset: int = 0,
) -> dict:
    """Search for documents stored in the customer's DataHub deployment.

    These are the organization's own documents (runbooks, FAQs, knowledge
    articles) ingested from sources like Notion, Confluence, etc.

    Only document metadata is returned — content is omitted to keep responses
    concise. Retrieve full content for a specific document URN via
    get_entities().

    KEYWORD SEARCH:
    - Full-text search with boolean logic
    - Prefix the query with /q for structured queries
    - Examples:
      • /q deployment guide → documents containing both terms
      • /q kubernetes OR k8s → documents with either term
      • /q "production deployment" → exact phrase match

    FILTERS - Narrow results by metadata:

    platforms: Filter by source platform (use full URN)
        - Examples: ["urn:li:dataPlatform:notion"], ["urn:li:dataPlatform:confluence"]

    domains: Filter by business domain (use full URN)
        - Examples: ["urn:li:domain:engineering"], ["urn:li:domain:data-platform"]

    tags: Filter by tags (use full URN)
        - Examples: ["urn:li:tag:critical"], ["urn:li:tag:deprecated"]

    glossary_terms: Filter by glossary terms (use full URN)
        - Examples: ["urn:li:glossaryTerm:pii"], ["urn:li:glossaryTerm:gdpr"]

    owners: Filter by document owners (use full URN)
        - Examples: ["urn:li:corpuser:alice"], ["urn:li:corpGroup:platform-team"]

    PAGINATION:
    - num_results: results per page (max: 50)
    - offset: starting position (default: 0)

    FACET DISCOVERY:
    - Pass num_results=0 to receive ONLY facets (no results)
    - Useful for discovering which platforms and domains exist

    EXAMPLE WORKFLOWS:

    1. Find Notion docs about deployment:
       search_documents(query="deployment", platforms=["urn:li:dataPlatform:notion"])

    2. Discover document sources:
       search_documents(num_results=0)
       → Examine facets to see available platforms, domains

    3. Find engineering team's critical docs:
       search_documents(
           domains=["urn:li:domain:engineering"],
           tags=["urn:li:tag:critical"]
       )

    Args:
        query: Search query string
        platforms: List of platform URNs to filter by
        domains: List of domain URNs to filter by
        tags: List of tag URNs to filter by
        glossary_terms: List of glossary term URNs to filter by
        owners: List of owner URNs to filter by
        num_results: Number of results to return (max 50)
        offset: Starting position for pagination

    Returns:
        Dictionary with search results and facets

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = search_documents(query="deployment")
    """
    # Delegate to the shared implementation, pinned to keyword search; the
    # graph comes from the ambient DataHubContext.
    return _search_documents_impl(
        get_graph(),
        query=query,
        search_strategy="keyword",
        platforms=platforms,
        domains=domains,
        tags=tags,
        glossary_terms=glossary_terms,
        owners=owners,
        num_results=num_results,
        offset=offset,
    )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _search_documents_impl(
    graph,
    query: str = "*",
    search_strategy: Optional[Literal["semantic", "keyword"]] = None,
    sub_types: Optional[List[str]] = None,
    platforms: Optional[List[str]] = None,
    domains: Optional[List[str]] = None,
    tags: Optional[List[str]] = None,
    glossary_terms: Optional[List[str]] = None,
    owners: Optional[List[str]] = None,
    num_results: int = 10,
    offset: int = 0,
) -> dict:
    """Search for documents stored in the customer's DataHub deployment.

    These are the organization's own documents (runbooks, FAQs, knowledge articles)
    ingested from sources like Notion, Confluence, etc. - not DataHub documentation.

    Returns document metadata WITHOUT content to keep responses concise.
    Use get_entities() with a document URN to retrieve full content when needed.

    SEARCH STRATEGIES:

    SEMANTIC SEARCH (search_strategy="semantic"):
    - Uses AI embeddings to find conceptually related documents
    - Best for: natural language queries, finding related topics
    - Example: "how to deploy" finds deployment guides, CI/CD docs, release runbooks
    - NOTE: the semantic query does not support pagination; `offset` is ignored

    KEYWORD SEARCH (search_strategy="keyword" or default):
    - Full-text search with boolean logic
    - Use /q prefix for structured queries
    - Examples:
      • /q deployment guide → documents containing both terms
      • /q kubernetes OR k8s → documents with either term
      • /q "production deployment" → exact phrase match

    FILTERS - Narrow results by metadata:

    sub_types: Filter by document type
        - Examples: ["Runbook"], ["FAQ", "Tutorial"], ["Reference"]

    platforms: Filter by source platform (use full URN)
        - Examples: ["urn:li:dataPlatform:notion"], ["urn:li:dataPlatform:confluence"]

    domains: Filter by business domain (use full URN)
        - Examples: ["urn:li:domain:engineering"], ["urn:li:domain:data-platform"]

    tags: Filter by tags (use full URN)
        - Examples: ["urn:li:tag:critical"], ["urn:li:tag:deprecated"]

    glossary_terms: Filter by glossary terms (use full URN)
        - Examples: ["urn:li:glossaryTerm:pii"], ["urn:li:glossaryTerm:gdpr"]

    owners: Filter by document owners (use full URN)
        - Examples: ["urn:li:corpuser:alice"], ["urn:li:corpGroup:platform-team"]

    PAGINATION:
    - num_results: Number of results per page (max: 50)
    - offset: Starting position (default: 0)

    FACET DISCOVERY:
    - Set num_results=0 to get ONLY facets (no results)
    - Useful for discovering what sub_types, platforms, domains exist

    EXAMPLE WORKFLOWS:

    1. Find all runbooks:
       search_documents(sub_types=["Runbook"])

    2. Find Notion docs about deployment:
       search_documents(query="deployment", platforms=["urn:li:dataPlatform:notion"])

    3. Discover document types:
       search_documents(num_results=0)
       → Examine facets to see available subTypes, platforms, domains

    4. Find engineering team's critical docs:
       search_documents(
           domains=["urn:li:domain:engineering"],
           tags=["urn:li:tag:critical"]
       )
    """
    # Clamp num_results into the supported range [0, 50]. The previous
    # min(num_results, 50) let a negative value through, which then became
    # count=1 below and silently returned one result instead of none.
    num_results = max(0, min(num_results, 50))

    # Build orFilters from the simple filter parameters.
    # Each filter type is ANDed together; values within a filter are ORed.
    filter_specs = (
        ("subTypes", sub_types),
        ("platform", platforms),
        ("domains", domains),
        ("tags", tags),
        ("glossaryTerms", glossary_terms),
        ("owners", owners),
    )
    and_filters: List[Dict[str, Any]] = [
        {"field": field, "values": values} for field, values in filter_specs if values
    ]

    # Wrap in orFilters format (list of AND groups)
    or_filters = [{"and": and_filters}] if and_filters else []

    # Fetch and apply default view
    view_urn = fetch_global_default_view(graph)

    # Choose search strategy
    if search_strategy == "semantic":
        gql_query = document_semantic_search_gql
        operation_name = "documentSemanticSearch"
        response_key = "semanticSearchAcrossEntities"
        # The semantic GraphQL operation takes no "start" variable, so
        # `offset` is intentionally not passed here.
        variables: Dict[str, Any] = {
            "query": query,
            "orFilters": or_filters,
            # count must be >= 1 even for facet-only queries (num_results=0);
            # the results are stripped from the response below.
            "count": max(num_results, 1),
            "viewUrn": view_urn,
        }
    else:
        # Default: keyword search
        gql_query = document_search_gql
        operation_name = "documentSearch"
        response_key = "searchAcrossEntities"
        variables = {
            "query": query,
            "orFilters": or_filters,
            "count": max(num_results, 1),
            "start": offset,
            "viewUrn": view_urn,
        }

    response = execute_graphql(
        graph,
        query=gql_query,
        variables=variables,
        operation_name=operation_name,
    )[response_key]

    if num_results == 0 and isinstance(response, dict):
        # Support num_results=0 for facet-only queries
        response.pop("searchResults", None)
        response.pop("count", None)

    return clean_gql_response(response)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def grep_documents(
    urns: List[str],
    pattern: str,
    context_chars: int = 200,
    max_matches_per_doc: int = 5,
    start_offset: int = 0,
) -> dict:
    """Search within document content using regex patterns.

    Similar to ripgrep/grep - finds matching excerpts within documents.
    Use search_documents() first to find relevant document URNs, then use this
    tool to search within their content.

    PATTERN SYNTAX (RE2 regex):
    - Simple text: "deploy" matches the word deploy
    - Case insensitive: "(?i)deploy" matches Deploy, DEPLOY, deploy
    - Word boundaries: r"\\bword\\b" matches whole word only
    - Alternatives: "deploy|release" matches either term
    - Wildcards: "deploy.*prod" matches deploy followed by prod
    - Character classes: "[Dd]eploy" matches Deploy or deploy

    PARAMETERS:

    urns: List of document URNs to search within
        - Get these from search_documents() results
        - Example: ["urn:li:document:doc1", "urn:li:document:doc2"]

    pattern: Regex pattern to search for
        - Examples: "kubernetes", "(?i)deploy.*production", "error|warning"
        - Use ".*" to get raw content (for continuing after truncation)

    context_chars: Characters to show before/after each match (default: 200)
        - Higher values show more surrounding context
        - When reading raw content (pattern=".*"), use higher values (e.g., 8000)

    max_matches_per_doc: Maximum matches to return per document (default: 5)
        - Limits output size for documents with many matches

    start_offset: Character offset to start searching from (default: 0)
        - Use this to continue reading after get_entities() truncation
        - When get_entities() returns _truncatedAtChar=8000, use start_offset=8000
          to continue reading from where it left off

    EXAMPLE WORKFLOWS:

    1. Find deployment instructions:
       docs = search_documents(query="deployment", sub_types=["Runbook"])
       urns = [r["entity"]["urn"] for r in docs["searchResults"]]
       grep_documents(urns, pattern="kubectl apply", context_chars=300)

    2. Find all error handling sections (case insensitive):
       grep_documents(urns, pattern="(?i)error|exception|failure")

    3. Find specific configuration values:
       grep_documents(urns, pattern=r"timeout.*=.*\\d+")

    4. Continue reading after truncation (when get_entities returns _truncatedAtChar):
       # After get_entities() shows: _truncatedAtChar=8000, _originalLengthChars=15000
       grep_documents(urns=[doc_urn], pattern=".*", context_chars=8000, start_offset=8000)
       # Returns content from char 8000 onwards

    RETURNS:
    - results: List of documents with matches, each containing:
        - urn: Document URN
        - title: Document title
        - matches: List of excerpts with position info (positions are absolute)
        - total_matches: Total matches found (may exceed max_matches_per_doc)
        - content_length: Total length of document content (when start_offset is used)
    - total_matches: Total matches across all documents
    - documents_with_matches: Number of documents containing matches

    Args:
        urns: List of document URNs to search within
        pattern: Regex pattern to search for
        context_chars: Characters to show before/after each match
        max_matches_per_doc: Maximum matches to return per document
        start_offset: Character offset to start searching from

    Returns:
        Dictionary with search results

    Example:
        from datahub_agent_context.context import DataHubContext

        with DataHubContext(client.graph):
            result = grep_documents(urns=["urn:li:document:doc1"], pattern="kubernetes")
    """
    graph = get_graph()
    if not urns:
        return {
            "results": [],
            "total_matches": 0,
            "documents_with_matches": 0,
        }

    # Clamp negative inputs. Previously a negative start_offset bypassed the
    # `> 0` guard below but still shifted every reported match position
    # negative via `match.start() + start_offset`; a negative context_chars
    # inverted the excerpt window.
    start_offset = max(0, start_offset)
    context_chars = max(0, context_chars)

    # Fetch document content via GraphQL
    response = execute_graphql(
        graph,
        query=read_documents_gql,
        variables={"urns": urns},
        operation_name="documentContent",
    )

    entities = response.get("entities", [])

    # Compile regex pattern using RE2 (safe against ReDoS attacks)
    # RE2 guarantees linear-time matching, preventing pathological backtracking
    try:
        regex = re2.compile(pattern)
    except re2.error as e:
        return {
            "error": f"Invalid regex pattern: {e}",
            "results": [],
            "total_matches": 0,
            "documents_with_matches": 0,
        }

    results = []
    total_matches = 0
    documents_with_matches = 0

    for entity in entities:
        # Unresolvable URNs may come back as null entities; skip them.
        if not entity:
            continue

        urn = entity.get("urn", "")
        info = entity.get("info", {})
        title = info.get("title", "Untitled")
        contents = info.get("contents", {})
        text = contents.get("text", "") if contents else ""

        if not text:
            continue

        # Store original length before applying offset
        full_content_length = len(text)

        # Apply start_offset - skip first N characters
        if start_offset > 0:
            if start_offset >= len(text):
                # Offset is beyond document length, skip this document
                continue
            text = text[start_offset:]

        # Iterate through matches - only store excerpts for first max_matches_per_doc,
        # but count all matches without keeping them in memory
        excerpts: List[Dict[str, Any]] = []
        doc_total_matches = 0

        for match in regex.finditer(text):
            doc_total_matches += 1

            # Only extract excerpts for first max_matches_per_doc matches
            if len(excerpts) < max_matches_per_doc:
                start_pos = max(0, match.start() - context_chars)
                end_pos = min(len(text), match.end() + context_chars)

                # Extract excerpt
                excerpt = text[start_pos:end_pos]

                # Add ellipsis if truncated
                if start_pos > 0:
                    excerpt = "..." + excerpt
                if end_pos < len(text):
                    excerpt = excerpt + "..."

                # Report absolute position (accounting for start_offset)
                absolute_position = match.start() + start_offset

                excerpts.append(
                    {
                        "excerpt": excerpt,
                        "position": absolute_position,
                    }
                )

        if doc_total_matches == 0:
            continue

        documents_with_matches += 1
        total_matches += doc_total_matches

        result_entry: Dict[str, Any] = {
            "urn": urn,
            "title": title,
            "matches": excerpts,
            "total_matches": doc_total_matches,
        }

        # Include content_length when using start_offset to help with pagination
        if start_offset > 0:
            result_entry["content_length"] = full_content_length

        results.append(result_entry)

    return {
        "results": results,
        "total_matches": total_matches,
        "documents_with_matches": documents_with_matches,
    }
|