sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
  41. sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
sirchmunk/learnings/evidence_processor.py
@@ -0,0 +1,525 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ import asyncio
+ import json
+ import math
+ import random
+ from dataclasses import dataclass
+ from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple
+
+ from rapidfuzz import fuzz, process
+
+ from sirchmunk.llm.openai_chat import OpenAIChat
+ from sirchmunk.llm.prompts import EVALUATE_EVIDENCE_SAMPLE, ROI_RESULT_SUMMARY
+ from sirchmunk.utils import create_logger, LogCallback
+
+
+ @dataclass
+ class SampleWindow:
+     """
+     Sampling window configuration and metadata.
+     """
+
+     start_idx: int
+
+     end_idx: int
+
+     content: str
+
+     # Relevance score from LLM
+     score: float = 0.0
+
+     # Literal match score from RapidFuzz
+     fuzz_score: float = 0.0
+
+     reasoning: str = ""
+
+     round_num: int = 0
+
+     # 'fuzz', 'stratified', 'gaussian'
+     source: str = "unknown"
+
+
+ @dataclass
+ class RoiResult:
+     """
+     Data class to store the final Region of Interest (ROI) result and metadata.
+     """
+
+     summary: str
+
+     is_found: bool
+
+     # Segments within the document (e.g., paragraph, code snippet)
+     # Format: {"snippet": "xxx", "start": 7, "end": 65, "score": 9.0, "reasoning": "xxx"}
+     snippets: List[Dict[str, Any]]
+
+     def to_dict(self):
+         """
+         Convert RoiResult to a dictionary.
+         """
+         return {
+             "summary": self.summary,
+             "is_found": self.is_found,
+             "snippets": self.snippets,
+         }
+
+
+ class MonteCarloEvidenceSampling:
+     """
+     Monte Carlo Evidence Importance Sampling for Document Retrieval.
+     """
+
+     def __init__(
+         self,
+         llm: OpenAIChat,
+         doc_content: str,
+         verbose: bool = True,
+         log_callback: LogCallback = None,
+     ):
+         self.llm = llm
+         self.doc = doc_content
+         self.doc_len = len(doc_content)
+         self.verbose = verbose
+
+         self.max_rounds = 3
+         # Size of each probe sampling window
+         self.probe_window = 500
+         # Size of the final expanded context
+         self.roi_window = 2000
+
+         # --- Sampling Configuration --- #
+         # Number of anchors from RapidFuzz
+         self.fuzz_candidates_num = 5
+         # Number of random points for exploration
+         self.random_exploration_num = 2
+         # Samples per round for Gaussian sampling
+         self.samples_per_round = 5
+         # Top K samples to keep as seeds for the next round
+         self.top_k_seeds = 2
+
+         self.visited_starts: Set[int] = set()
+
+         # Create bound logger with callback - returns AsyncLogger instance
+         self._log = create_logger(log_callback=log_callback)
+
+         self.llm_usages: List[Dict[str, Any]] = []
+
+     def _get_content(self, start: int) -> Tuple[int, int, str]:
+         """
+         Safely retrieves a document slice with boundary checks.
+         """
+         start = max(0, min(start, self.doc_len - self.probe_window))
+         end = min(start + self.probe_window, self.doc_len)
+         return start, end, self.doc[start:end]
+
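To illustrate the clamping above with concrete numbers (the values are hypothetical, not class defaults): for a 1,000-character document and a 500-character probe window, a requested start of 900 is pulled back to 500 so the window never runs past the end, and a negative start is clamped to 0.

# Hypothetical values, for illustration only
doc_len, probe_window = 1_000, 500
for requested in (-100, 200, 900):
    start = max(0, min(requested, doc_len - probe_window))
    end = min(start + probe_window, doc_len)
    print(requested, "->", (start, end))  # (0, 500), (200, 700), (500, 1000)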
+     async def _get_fuzzy_anchors(
+         self, query: str, keywords: Optional[List[str]] = None, threshold: float = 10.0
+     ) -> List[SampleWindow]:
+         """
+         Uses RapidFuzz to find heuristic anchors based on literal matching.
+         Logic: Sliding window slices -> Calculate similarity with Query -> Top K.
+
+         Args:
+             query (str): The user query string.
+             keywords (List[str], optional): Extra keywords appended to the query.
+             threshold (float): Minimum similarity score (0-100) for a window to be kept.
+
+         Returns:
+             List[SampleWindow]: List of sampled windows based on fuzzy matching.
+         """
+         if self.verbose:
+             await self._log.info("Executing RapidFuzz heuristic pre-filtering...")
+
+         keywords = keywords or []
+
+         # 1. Build sliding window start offsets (stride = half window size)
+         stride = self.probe_window // 2
+         chunks = list(range(0, self.doc_len, stride))
+
+         # 2. Construct text list for matching
+         chunk_texts = [self.doc[i : i + self.probe_window] for i in chunks]
+
+         # 3. Extract most similar fragments
+         # TODO: try to add `fuzz.token_set_ratio` for multi-channel retrieval
+         results = process.extract(
+             query=f"{query} {' '.join(keywords)}".strip(),
+             choices=chunk_texts,
+             scorer=fuzz.token_set_ratio,
+             limit=int(self.fuzz_candidates_num * 2),
+             score_cutoff=None,
+         )
+
+         anchors = []
+         for text, score, index in results:
+             start_idx = chunks[index]
+
+             # Simple deduplication
+             if start_idx in self.visited_starts:
+                 continue
+
+             # Threshold filtering
+             if score < threshold:
+                 continue
+
+             self.visited_starts.add(start_idx)
+             _, end, content = self._get_content(start_idx)
+
+             anchors.append(
+                 SampleWindow(
+                     start_idx=start_idx,
+                     end_idx=end,
+                     content=content,
+                     fuzz_score=score,
+                     round_num=1,
+                     source="fuzz",
+                 )
+             )
+
+             if len(anchors) >= self.fuzz_candidates_num:
+                 break
+
+         top_score = anchors[0].fuzz_score if anchors else 0.0
+         if self.verbose:
+             await self._log.info(
+                 f"  Anchors hit: {len(anchors)} (Top Fuzz Score: {top_score:.1f})"
+             )
+
+         return anchors
+
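The anchor scan above can be reproduced in isolation with rapidfuzz alone. A minimal sketch, assuming illustrative window and stride values rather than the class defaults; `process.extract` returns (text, score, index) triples, best match first:

from rapidfuzz import fuzz, process

doc = "The quarterly report covers revenue, churn, and hiring plans in detail."
window, stride = 40, 20
starts = list(range(0, len(doc), stride))
windows = [doc[i : i + window] for i in starts]

for text, score, index in process.extract(
    "revenue and churn", windows, scorer=fuzz.token_set_ratio, limit=3
):
    print(f"offset={starts[index]:3d} score={score:5.1f} | {text!r}")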
+     def _sample_stratified_supplement(self, count: int) -> List[SampleWindow]:
+         """
+         Adds a small amount of global random sampling for 'Exploration',
+         covering cases where the query is semantically relevant but lacks
+         literal keyword matches.
+
+         Args:
+             count (int): Number of random samples to generate.
+
+         Returns:
+             List[SampleWindow]: List of randomly sampled windows.
+         """
+         samples = []
+         if count <= 0:
+             return samples
+
+         step = self.doc_len // count
+         for i in range(count):
+             section_start = i * step
+             section_end = min((i + 1) * step, self.doc_len)
+
+             # Random selection within the section
+             max_start = max(section_start, section_end - self.probe_window)
+             rand_start = random.randint(section_start, max_start)
+
+             start, end, content = self._get_content(rand_start)
+
+             # Check for overlap with existing points
+             is_duplicate = False
+             for v in self.visited_starts:
+                 if abs(v - start) < (self.probe_window // 2):
+                     is_duplicate = True
+                     break
+
+             if not is_duplicate:
+                 self.visited_starts.add(start)
+                 samples.append(
+                     SampleWindow(
+                         start_idx=start,
+                         end_idx=end,
+                         content=content,
+                         round_num=1,
+                         source="stratified",
+                     )
+                 )
+
+         return samples
+
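A quick worked example of the stratification arithmetic above, assuming a 10,000-character document, four requested samples, and the default 500-character probe window: each sample is drawn from its own quarter of the document, so exploration cannot collapse onto a single region.

import random

doc_len, count, probe_window = 10_000, 4, 500
step = doc_len // count  # 2500
for i in range(count):
    section_start = i * step                    # 0, 2500, 5000, 7500
    section_end = min((i + 1) * step, doc_len)  # 2500, 5000, 7500, 10000
    max_start = max(section_start, section_end - probe_window)
    print(i, random.randint(section_start, max_start))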
+     def _sample_gaussian(
+         self, seeds: List[SampleWindow], current_round: int
+     ) -> List[SampleWindow]:
+         """
+         [Subsequent Rounds] Gaussian Importance Sampling.
+
+         Args:
+             seeds (List[SampleWindow]): High-value seeds from previous round.
+             current_round (int): Current round number.
+
+         Returns:
+             List[SampleWindow]: List of newly sampled windows.
+         """
+         samples = []
+         # Sigma Decay: Shrink search range as rounds progress
+         base_sigma = self.doc_len / 20
+         sigma = base_sigma / (2 ** (current_round - 1))
+
+         samples_needed = self.samples_per_round
+
+         for seed in seeds:
+             if samples_needed <= 0:
+                 break
+
+             # Allocate children per seed
+             num_children = max(1, math.ceil(samples_needed / len(seeds)))
+             center = (seed.start_idx + seed.end_idx) // 2
+
+             for _ in range(num_children):
+                 new_center = int(random.gauss(center, sigma))
+                 raw_start = new_center - (self.probe_window // 2)
+                 start, end, content = self._get_content(raw_start)
+
+                 # Deduplication check
+                 too_close = False
+                 for existing in self.visited_starts:
+                     if abs(existing - start) < (self.probe_window // 3):
+                         too_close = True
+                         break
+
+                 if not too_close:
+                     self.visited_starts.add(start)
+                     samples.append(
+                         SampleWindow(
+                             start_idx=start,
+                             end_idx=end,
+                             content=content,
+                             round_num=current_round,
+                             source="gaussian",
+                         )
+                     )
+                     samples_needed -= 1
+
+         return samples
+
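The sigma decay above halves the Gaussian search radius each round, so later rounds probe ever closer to the surviving seeds. For a hypothetical 100,000-character document:

doc_len = 100_000
base_sigma = doc_len / 20  # 5000.0
for current_round in (1, 2, 3):
    # Prints 5000.0, 2500.0, 1250.0
    print(current_round, base_sigma / (2 ** (current_round - 1)))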
+     async def _evaluate_sample_async(
+         self, sample: SampleWindow, query: str
+     ) -> SampleWindow:
+         """
+         Evaluates a single sample asynchronously.
+         """
+         prompt = EVALUATE_EVIDENCE_SAMPLE.format(
+             query=query,
+             sample_source=sample.source,
+             sample_content=sample.content,
+         )
+         try:
+             resp_obj = await self.llm.achat([{"role": "user", "content": prompt}])
+             resp: str = resp_obj.content
+             self.llm_usages.append(resp_obj.usage)
+
+             clean_resp = resp.replace("```json", "").replace("```", "").strip()
+             data = json.loads(clean_resp)
+             sample.score = float(data.get("score", 0))
+             sample.reasoning = data.get("reasoning", "")
+         except Exception as e:
+             await self._log.warning(f"Error evaluating sample at {sample.start_idx}: {e}")
+             sample.score = 0.0
+
+         return sample
+
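The fence-stripping step above guards against models that wrap their JSON verdict in a markdown code block. A standalone illustration with a made-up response string:

import json

resp = '```json\n{"score": 8.5, "reasoning": "mentions the exact API"}\n```'
clean_resp = resp.replace("```json", "").replace("```", "").strip()
print(json.loads(clean_resp))  # {'score': 8.5, 'reasoning': 'mentions the exact API'}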
+     async def _evaluate_batch(
+         self, samples: List[SampleWindow], query: str
+     ) -> List[SampleWindow]:
+         """
+         Evaluates a batch of samples concurrently.
+         """
+         if self.verbose:
+             await self._log.info(f"  Evaluating {len(samples)} samples with LLM...")
+
+         # Create async tasks
+         tasks = [self._evaluate_sample_async(s, query) for s in samples]
+
+         # Run concurrently
+         evaluated_samples = await asyncio.gather(*tasks)
+         return list(evaluated_samples)
+
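_evaluate_batch is a plain asyncio fan-out: one coroutine per sample, gathered concurrently, so total latency tracks the slowest LLM call rather than the sum. The same pattern with a stub standing in for the LLM (the stub and its timings are invented for illustration):

import asyncio
import random

async def score_stub(idx: int) -> float:
    # Stand-in for the LLM relevance call
    await asyncio.sleep(random.uniform(0.01, 0.05))
    return random.uniform(0.0, 10.0)

async def main():
    scores = await asyncio.gather(*(score_stub(i) for i in range(5)))
    print(scores)

asyncio.run(main())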
+     async def _generate_summary(
+         self, top_samples: List[SampleWindow], query: str
+     ) -> str:
+         """
+         Expands the context windows for multiple top samples and generates a summary.
+         """
+         combined_context = ""
+         half_window = self.roi_window // 2
+
+         # Sort by index so the combined context preserves document order
+         processed_samples = sorted(top_samples, key=lambda x: x.start_idx)
+
+         for i, sample in enumerate(processed_samples):
+             center = (sample.start_idx + sample.end_idx) // 2
+             start = max(0, center - half_window)
+             end = min(self.doc_len, center + half_window)
+             expanded_content = self.doc[start:end]
+             combined_context += (
+                 f"\n--- Context Fragment {i + 1} ---\n...{expanded_content}...\n"
+             )
+
+         prompt = ROI_RESULT_SUMMARY.format(
+             user_input=query,
+             text_content=combined_context,
+         )
+
+         summary_response = await self.llm.achat([{"role": "user", "content": prompt}])
+         self.llm_usages.append(summary_response.usage)
+         return summary_response.content
+
+     async def get_roi(
+         self,
+         query: str,
+         keywords: Optional[Dict[str, float]] = None,
+         confidence_threshold: float = 8.5,
+         top_k: int = 5,
+     ) -> RoiResult:
+         """
+         Get the Region of Interest (ROI) for the given query.
+
+         Args:
+             query (str): The user query string.
+             keywords (Dict[str, float], optional): Enhanced keywords with IDF scores for fuzzy matching.
+             confidence_threshold (float): Confidence score threshold for early stopping.
+             top_k (int): Number of top snippets to consider for the final summary.
+
+         Returns:
+             RoiResult: The final ROI result with metadata.
+         """
+         if self.verbose:
+             await self._log.info(
+                 f"=== Starting Hybrid Adaptive Retrieval (Doc Len: {self.doc_len}) ==="
+             )
+             await self._log.info(f"Query: {query}, optional keywords: {keywords}")
+
+         keywords = keywords or {}
+
+         all_candidates: List[SampleWindow] = []
+         top_seeds: List[SampleWindow] = []
+
+         for r in range(1, self.max_rounds + 1):
+             if self.verbose:
+                 await self._log.info(f"--- Round {r}/{self.max_rounds} ---")
+             current_samples = []
+
+             if r == 1:
+                 # === Strategy: Fuzz Anchors + Random Supplement ===
+                 # 1. Get Fuzz Anchors (Exploitation)
+                 # Note: async to support the log callback
+                 fuzz_samples = await self._get_fuzzy_anchors(
+                     query=query,
+                     keywords=list(keywords.keys()),
+                     threshold=10.0,
+                 )
+                 current_samples.extend(fuzz_samples)
+
+                 # 2. Supplement with Random Sampling (Exploration)
+                 needed_random = self.random_exploration_num
+                 if len(fuzz_samples) == 0:
+                     # No anchors found: downgrade to random mode
+                     needed_random += 3
+
+                 random_samples = self._sample_stratified_supplement(needed_random)
+                 current_samples.extend(random_samples)
+
+                 if self.verbose:
+                     await self._log.info(
+                         f"Sampling Distribution: Fuzz Anchors={len(fuzz_samples)}, "
+                         f"Random Exploration={len(random_samples)}"
+                     )
+
+             else:
+                 # === Subsequent Rounds: Gaussian Focusing ===
+                 # Filter out low-score seeds
+                 valid_seeds = [s for s in top_seeds if s.score >= 4.0]
+
+                 if not valid_seeds:
+                     await self._log.warning(
+                         "No high-value regions found, attempting global random sampling again..."
+                     )
+                     current_samples = self._sample_stratified_supplement(
+                         self.samples_per_round
+                     )
+                 else:
+                     max_score = valid_seeds[0].score
+                     if self.verbose:
+                         await self._log.info(
+                             f"Focusing: Based on {len(valid_seeds)} seeds (Max Score: {max_score})"
+                         )
+                     current_samples = self._sample_gaussian(valid_seeds, r)
+
+             if not current_samples and self.verbose:
+                 await self._log.info("No new samples generated this round, skipping.")
+             else:
+                 evaluated = await self._evaluate_batch(current_samples, query)
+                 all_candidates.extend(evaluated)
+
+                 for s in evaluated:
+                     await self._log.info(
+                         f"  [Pos {s.start_idx:6d} | Src: {s.source:8s}] Score: {s.score} | {s.reasoning[:30]}..."
+                     )
+
+             # Sort and update seeds
+             all_candidates.sort(key=lambda x: x.score, reverse=True)
+             top_seeds = all_candidates[: self.top_k_seeds]
+
+             # Early stopping check
+             if top_seeds and top_seeds[0].score >= confidence_threshold:
+                 if self.verbose:
+                     await self._log.info(
+                         f"High confidence target found (Score >= {confidence_threshold}), stopping early."
+                     )
+                 break
+
+         # --- Final Result Processing ---
+         if not all_candidates:
+             await self._log.warning("Failed to retrieve any content.")
+             return RoiResult(
+                 summary="Could not retrieve relevant content.",
+                 is_found=False,
+                 snippets=[],
+             )
+
+         # Collect top candidates that are relevant enough,
+         # using 4.0 as a soft threshold for relevance inclusion
+         relevant_candidates = [c for c in all_candidates if c.score >= 4.0]
+
+         # If nothing meets the threshold, fall back to the single best candidate
+         if not relevant_candidates:
+             best = all_candidates[0]
+             return RoiResult(
+                 summary="No exact answer found in the document.",
+                 is_found=False,
+                 snippets=[
+                     {
+                         "snippet": best.content,
+                         "start": best.start_idx,
+                         "end": best.end_idx,
+                         "score": best.score,
+                         "reasoning": best.reasoning,
+                     }
+                 ],
+             )
+
+         # Take up to top_k candidates as the final set for summarization
+         final_candidates = relevant_candidates[:top_k]
+         best_score = final_candidates[0].score
+
+         if self.verbose:
+             await self._log.info(
+                 f"=== Final Lock: {len(final_candidates)} snippets, Top Score {best_score} ==="
+             )
+
+         # Generate summary
+         summary = await self._generate_summary(final_candidates, query)
+
+         # Construct the final snippet payload
+         roi_snippets = []
+         for c in final_candidates:
+             roi_snippets.append(
+                 {
+                     "snippet": c.content,
+                     "start": c.start_idx,
+                     "end": c.end_idx,
+                     "score": c.score,
+                     "reasoning": c.reasoning,
+                 }
+             )
+
+         return RoiResult(
+             summary=summary,
+             is_found=True,
+             snippets=roi_snippets,
+         )
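Taken together, a typical call sequence for this class might look like the sketch below. The OpenAIChat constructor arguments, input file name, and query are assumptions for illustration; see sirchmunk/llm/openai_chat.py in this release for the actual client signature.

import asyncio

from sirchmunk.learnings.evidence_processor import MonteCarloEvidenceSampling
from sirchmunk.llm.openai_chat import OpenAIChat

async def main():
    llm = OpenAIChat()  # constructor arguments omitted; see openai_chat.py
    doc = open("report.txt", encoding="utf-8").read()  # hypothetical input file
    sampler = MonteCarloEvidenceSampling(llm=llm, doc_content=doc)
    roi = await sampler.get_roi("What were the Q3 revenue drivers?")
    print(roi.to_dict()["summary"])

asyncio.run(main())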