sirchmunk 0.0.0__py3-none-any.whl → 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/__init__.py +8 -0
- sirchmunk/base.py +17 -0
- sirchmunk/insight/__init__.py +4 -0
- sirchmunk/insight/text_insights.py +292 -0
- sirchmunk/learnings/__init__.py +1 -0
- sirchmunk/learnings/evidence_processor.py +525 -0
- sirchmunk/learnings/knowledge_base.py +232 -0
- sirchmunk/llm/__init__.py +2 -0
- sirchmunk/llm/openai_chat.py +247 -0
- sirchmunk/llm/prompts.py +216 -0
- sirchmunk/retrieve/__init__.py +1 -0
- sirchmunk/retrieve/base.py +25 -0
- sirchmunk/retrieve/text_retriever.py +1026 -0
- sirchmunk/scan/__init__.py +1 -0
- sirchmunk/scan/base.py +18 -0
- sirchmunk/scan/file_scanner.py +373 -0
- sirchmunk/scan/web_scanner.py +18 -0
- sirchmunk/scheduler/__init__.py +0 -0
- sirchmunk/schema/__init__.py +2 -0
- sirchmunk/schema/cognition.py +106 -0
- sirchmunk/schema/context.py +25 -0
- sirchmunk/schema/knowledge.py +318 -0
- sirchmunk/schema/metadata.py +658 -0
- sirchmunk/schema/request.py +221 -0
- sirchmunk/schema/response.py +20 -0
- sirchmunk/schema/snapshot.py +346 -0
- sirchmunk/search.py +475 -0
- sirchmunk/storage/__init__.py +7 -0
- sirchmunk/storage/duckdb.py +676 -0
- sirchmunk/storage/knowledge_manager.py +720 -0
- sirchmunk/utils/__init__.py +15 -0
- sirchmunk/utils/constants.py +15 -0
- sirchmunk/utils/deps.py +23 -0
- sirchmunk/utils/file_utils.py +70 -0
- sirchmunk/utils/install_rga.py +124 -0
- sirchmunk/utils/log_utils.py +360 -0
- sirchmunk/utils/tokenizer_util.py +55 -0
- sirchmunk/utils/utils.py +108 -0
- sirchmunk/version.py +1 -1
- sirchmunk-0.0.1.dist-info/METADATA +416 -0
- sirchmunk-0.0.1.dist-info/RECORD +45 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.0.dist-info/METADATA +0 -26
- sirchmunk-0.0.0.dist-info/RECORD +0 -8
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Copyright (c) ModelScope Contributors. All rights reserved.
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Dict, List, Literal, Optional, Union
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class ImageURL:
    """Represents an image URL with optional detail and media type."""

    # Image location: an http(s) URL or a base64 / data-URI string.
    url: str
    # OpenAI-style detail hint; forwarded verbatim in to_openai payloads.
    detail: str = "auto"
    media_type: str = "image/jpeg"  # Necessary for Anthropic
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class ContentItem:
    """One piece of message content: plain text or an image reference."""

    # Discriminator: "text" or "image_url".
    type: str
    text: Optional[str] = None
    image_url: Optional[ImageURL] = None

    def to_openai(self):
        """Render this item in the OpenAI chat-completions content format."""
        if self.type == "text":
            return {"type": "text", "text": self.text}
        image_part = {"url": self.image_url.url, "detail": self.image_url.detail}
        return {"type": "image_url", "image_url": image_part}

    def to_anthropic(self):
        """Render this item in the Anthropic messages content format."""
        if self.type == "text":
            return {"type": "text", "text": self.text}

        # Anthropic wants the raw base64 payload; drop any data-URI prefix.
        raw_data = self.image_url.url
        if "base64," in raw_data:
            raw_data = raw_data.split("base64,")[1]

        source = {
            "type": "base64",
            "media_type": self.image_url.media_type,
            "data": raw_data,
        }
        return {"type": "image", "source": source}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class Message:
    """Represents a message in the conversation, system/user/assistant."""

    # Speaker role: "system", "user", or "assistant".
    role: str
    # Either a plain string or a multimodal list of ContentItem parts.
    content: Union[str, List[ContentItem]]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class Request:
    """
    Represents a request to Agentic Search API, supporting both OpenAI and Anthropic message formats.
    """

    # Conversation history; Message.content may be a str or a list of ContentItem.
    messages: List[Message]
    # System prompt; sent top-level for Anthropic, as the first message for OpenAI.
    system: Optional[str] = "You are a helpful assistant."
    message_format: Literal["openai", "anthropic"] = "openai"

    def get_system(self) -> str:
        """Get the system prompt."""
        return self.system

    def get_user_input(self) -> str:
        """Extract the query text from the first user message ("" if none)."""
        for m in self.messages:
            if m.role != "user":
                continue
            if isinstance(m.content, str):
                return m.content
            texts = [c.text for c in m.content if c.type == "text" and c.text]
            return " ".join(texts)
        return ""

    def get_image_urls(self) -> List[str]:
        """Extract image URLs from user messages."""
        image_urls: List[str] = []
        for m in self.messages:
            if m.role == "user" and isinstance(m.content, list):
                image_urls.extend(
                    c.image_url.url
                    for c in m.content
                    if c.type == "image_url" and c.image_url
                )
        return image_urls

    def to_payload(
        self, prompt_template: Optional[str] = None
    ) -> Union[List[Dict], Dict]:
        """Convert messages to the appropriate API payload format based on message_format.

        Args:
            prompt_template: Optional template with a ``{user_input}`` placeholder,
                applied to the text of user messages.

        Returns:
            A list of message dicts (OpenAI) or a payload dict (Anthropic).

        Raises:
            ValueError: If ``message_format`` is not 'openai' or 'anthropic'.
        """
        if self.message_format == "openai":
            return self._to_openai_payload(prompt_template=prompt_template)
        elif self.message_format == "anthropic":
            return self._to_anthropic_payload(prompt_template=prompt_template)
        else:
            raise ValueError(
                f"Unsupported message format: {self.message_format}, must be 'openai' or 'anthropic'."
            )

    @staticmethod
    def _render_content(content, prompt_template, convert):
        """Render one message's content for the target API.

        ``convert`` maps a ContentItem to the provider-specific dict;
        ``prompt_template`` (if given) is applied to text content only.
        """
        if isinstance(content, str):
            return (
                prompt_template.format(user_input=content)
                if prompt_template
                else content
            )
        rendered = []
        for c in content:
            if prompt_template and c.type == "text":
                rendered.append(
                    {"type": "text", "text": prompt_template.format(user_input=c.text)}
                )
            else:
                rendered.append(convert(c))
        return rendered

    def _to_openai_payload(self, prompt_template: Optional[str] = None) -> List[Dict]:
        """Convert messages to OpenAI API payload format.

        Bug fix: the system message is prepended to a local list instead of
        being inserted into ``self.messages``; the previous in-place insert
        accumulated a duplicate system message on every ``to_payload`` call.
        """
        formatted_msgs: List[Dict] = [{"role": "system", "content": self.system}]
        for m in self.messages:
            # Apply prompt template only to user messages.
            template = prompt_template if m.role == "user" else None
            content = self._render_content(m.content, template, lambda c: c.to_openai())
            formatted_msgs.append({"role": m.role, "content": content})
        return formatted_msgs

    def _to_anthropic_payload(self, prompt_template: Optional[str] = None) -> Dict:
        """Convert messages to Anthropic API payload format.

        Anthropic expects 'system' as a top-level parameter, not in messages.
        """
        formatted_msgs = []
        for m in self.messages:
            # Apply prompt template only to user messages.
            template = prompt_template if m.role == "user" else None
            content = self._render_content(
                m.content, template, lambda c: c.to_anthropic()
            )
            formatted_msgs.append({"role": m.role, "content": content})
        return {"system": self.system, "messages": formatted_msgs}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
if __name__ == "__main__":
    # Demo: build one request per provider format and print the payloads.
    import json

    # Template applied to user text via the {user_input} placeholder.
    prompt_template: str = (
        "Please answer the following question carefully based on the given information: {user_input}"
    )

    # Usage for Anthropic
    req_anthropic = Request(
        message_format="anthropic",
        system="Analyze the video frames carefully.",
        messages=[
            Message(
                role="user",
                content=[
                    ContentItem(type="text", text="What is happening here?"),
                    ContentItem(
                        type="image_url",
                        image_url=ImageURL(url="base64_string_here..."),
                    ),
                ],
            ),
        ],
    )
    print(
        f"Anthropic Payload:\n{json.dumps(req_anthropic.to_payload(prompt_template=prompt_template), ensure_ascii=False, indent=2)}"
    )

    # Usage for OpenAI
    req_openai = Request(
        message_format="openai",
        system="You are a helpful assistant.",
        messages=[
            Message(
                role="user",
                content=[
                    ContentItem(type="text", text="What is unusual about this image?"),
                    ContentItem(
                        type="image_url",
                        image_url=ImageURL(
                            url="https://example.com/strange-building.jpg"
                        ),
                    ),
                ],
            ),
        ],
    )
    print(
        f"\nOpenAI Payload:\n{json.dumps(req_openai.to_payload(prompt_template=prompt_template), ensure_ascii=False, indent=2)}"
    )

    print(f"\nUser Query (OpenAI format): {req_openai.get_user_input()}")

    print(f"\nImage URLs (OpenAI format): {req_openai.get_image_urls()}")
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Copyright (c) ModelScope Contributors. All rights reserved.
|
|
2
|
+
from dataclasses import dataclass, field
from typing import Any, Dict
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class Response:
    """
    Represents a response generated by the agentic search system.
    """

    # The main content of the response
    content: str

    # Additional data or metadata associated with the response.
    # default_factory gives each instance its own dict; the previous
    # ``= None`` default mis-declared the type and depended entirely on
    # __post_init__ for normalization.
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # Callers may still pass metadata=None explicitly; normalize to {}.
        if self.metadata is None:
            self.metadata = {}
|
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
# Copyright (c) ModelScope Contributors. All rights reserved.
|
|
2
|
+
import random
|
|
3
|
+
import re
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, List, Optional, Union
|
|
8
|
+
|
|
9
|
+
from loguru import logger
|
|
10
|
+
|
|
11
|
+
from sirchmunk.llm.openai_chat import OpenAIChat
|
|
12
|
+
from sirchmunk.utils.tokenizer_util import TokenizerUtil
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class SnapshotInfo:
    """
    Data class holding the sampled snapshot of a single file.
    """

    # Title of the snapshotted file.
    title: str = ""

    # Short description / table-of-contents style summary.
    description: str = ""

    # Keywords or key phrases extracted from the sampled content.
    keywords: List[str] = field(default_factory=list)

    # The sampled content lines themselves.
    contents: List[str] = field(default_factory=list)

    # Additional resources (e.g., images, tables, or others associated with current file)
    resources: List[Any] = field(default_factory=list)

    def to_dict(self):
        """Convert SnapshotInfo to a dictionary."""
        field_names = ("title", "description", "keywords", "contents", "resources")
        return {name: getattr(self, name) for name in field_names}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Snapshot(ABC):
    """
    Abstract base class for file snapshotting strategies.
    """

    def __init__(self, llm: Optional[OpenAIChat] = None, **kwargs):
        # Optional chat-model client used by subclasses for content insights.
        self.llm: Optional[OpenAIChat] = llm
        # Extra configuration forwarded verbatim by the caller.
        self.kwargs = kwargs

    @abstractmethod
    def sampling(self, **kwargs) -> List[str]:
        """
        Perform sampling on a file; concrete strategies must implement this.
        """
        raise NotImplementedError("Subclasses must implement this method.")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class TextSnapshot(Snapshot):
    """
    Text file sampling strategies for snapshotting large text files.
    """

    # TODO: support multiple sampling passes, especially for chunks in the
    # middle of the file; later the LLM can adaptively re-invoke sampling for
    # content enhancement.
    # "Staring mode" (concept borrowed from Radar): scheduled heuristically by
    # the LLM during the search phase; the more complex the document, the more
    # middle chunks are sampled, with iterative summarization that writes the
    # snapshot back into the metadata.
    # For contents, use multi-stage summarization.
    # def resampling(): ...

    MAX_SNAPSHOT_TOKENS = 2048  # Can be adaptive based on file size

    MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB

    def __init__(self, llm: Optional[OpenAIChat] = None, **kwargs):
        # Function-scope import — presumably to avoid a circular import with
        # the insight module; TODO confirm.
        from sirchmunk.insight.text_insights import TextInsights

        super().__init__(llm=llm, **kwargs)

        # LLM-backed keyword / description extraction helper.
        self.text_insights = TextInsights(llm=llm)
        # Token counter used to enforce the snapshot token budget.
        self.tokenizer_util = TokenizerUtil()

    @staticmethod
    def filter_line(line: str) -> Optional[str]:
        """Filter out lines with low information content or noise.

        Filters out:
        - Empty lines or whitespace-only lines
        - Markdown formatting noise (horizontal rules, excessive headings, etc.)
        - Lines consisting mostly of symbols/punctuation
        - URLs, email addresses, and common noise patterns
        - Very short lines (typically < 3 characters after cleaning)
        - Lines with excessive repeated characters
        - Common boilerplate/footer patterns

        Args:
            line: Input line string.

        Returns:
            Cleaned line string if it passes filters, None otherwise.
        """
        if not line:
            return None

        # Strip whitespace and check for empty lines
        stripped = line.strip()
        if not stripped:
            return None

        # Remove leading/trailing whitespace but preserve internal structure
        cleaned = line.rstrip(
            "\n\r"
        )  # Keep original indentation, only remove line endings

        # Check line length (after stripping)
        if len(stripped) < 10:
            return None

        # Markdown-specific noise patterns
        markdown_noise_patterns = [
            r"^\s*-{3,}\s*$",  # Horizontal rules (---, --- , etc.)
            r"^\s*\*{3,}\s*$",  # Horizontal rules (***)
            r"^\s*_{3,}\s*$",  # Horizontal rules (___)
            r"^\s*#{6,}.*$",  # Excessive headings (6+ #)
            r"^\s*>+\s*$",  # Empty blockquotes
            r"^\s*[-*+]\s*$",  # Empty list items
            r"^\s*\d+\.\s*$",  # Empty numbered list items
            r"^\s*!\[.*\]\(.*\)\s*$",  # Standalone images without context
            r"^\s*\[.*\]:\s*https?://\S+\s*$",  # Reference-style link definitions
            r"^\s*```\s*\w*\s*$",  # Code block delimiters (``` or ```python)
            r"^\s*~~~\s*\w*\s*$",  # Alternative code block delimiters
            r"^\s*\|\s*[-:]+\s*\|\s*$",  # Table separator rows
            r"^\s*\|(\s*:?-+:?\s*\|)+\s*$",  # Alternative table separator format
        ]

        # Check for Markdown noise patterns
        for pattern in markdown_noise_patterns:
            if re.match(pattern, stripped):
                return None

        # Remove common noise patterns and check if line becomes empty
        # (applied further below, after the cheap ratio checks).
        noise_patterns = [
            r"^https?://\S+",  # URLs at start of line
            r"\bhttps?://\S+\b",  # URLs anywhere
            r"\bwww\.\S+\b",  # www URLs
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",  # emails
            r"^[\s\*\-\_\=\#\+]+$",  # lines with only symbols
        ]

        # Check for excessive symbols ratio
        alphanumeric_chars = sum(c.isalnum() for c in stripped)
        if len(stripped) > 0 and alphanumeric_chars / len(stripped) < 0.3:
            return None

        # Check for excessive repeated characters (e.g., "..........." or "----------")
        if TextSnapshot._has_excessive_repetition(stripped):
            return None

        # Check for common boilerplate patterns
        boilerplate_patterns = [
            r"^(copyright|©|\(c\))",
            r"^all rights reserved",
            r"^confidential",
            r"^page \d+",
            r"^\d+\s+of\s+\d+$",
            r"^file:|^path:",
            r"^created:|^modified:",
            r"^author:",
            r"^version:",
            r"^build:",
            r"^generated by",
            r"^this document",
            r"^last updated",
            r"^table of contents",
            r"^toc:",
            r"^contents",
            r"^index",
        ]

        stripped_lower = stripped.lower()
        for pattern in boilerplate_patterns:
            if re.search(pattern, stripped_lower):
                return None

        # Apply noise pattern removal and check if meaningful content remains
        temp_line = stripped
        for pattern in noise_patterns + markdown_noise_patterns:
            temp_line = re.sub(pattern, "", temp_line)

        # After removing noise, check if we still have meaningful content
        temp_stripped = temp_line.strip()
        if (
            len(temp_stripped) < 3
            or (
                sum(c.isalnum() for c in temp_stripped) / len(temp_stripped)
                if len(temp_stripped) > 0
                else 0
            )
            < 0.4
        ):
            return None

        # Additional check: reject lines that are mostly Markdown syntax characters
        markdown_chars = sum(1 for c in stripped if c in "#*_-~`>![](){}|:")
        if len(stripped) > 0 and markdown_chars / len(stripped) > 0.6:
            return None

        return cleaned

    @staticmethod
    def _has_excessive_repetition(text: str) -> bool:
        """Check if text has excessive character repetition.

        Args:
            text: Input text string.

        Returns:
            True if excessive repetition detected.
        """
        if len(text) < 10:
            return False

        # Check for sequences of same character (e.g., "------", "......")
        for i in range(len(text) - 4):
            if text[i] == text[i + 1] == text[i + 2] == text[i + 3] == text[i + 4]:
                # Allow some legitimate cases like "hello....." but not full line
                if text.count(text[i]) / len(text) > 0.6:
                    return True

        # Check for alternating patterns (e.g., "- - - -", ". . . .")
        if re.search(r"([^\w\s])\s*\1\s*\1\s*\1", text):
            return True

        return False

    def sampling(
        self,
        file_path: Union[str, Path],
        max_snapshot_tokens: int = MAX_SNAPSHOT_TOKENS,
        max_file_size: int = MAX_FILE_SIZE,
    ) -> Union[SnapshotInfo, None]:
        """Build a token-bounded snapshot of a text file.

        Streams the file once, keeps informative lines (see ``filter_line``)
        with an adaptive acceptance probability, then asks the insight helper
        for keywords and a description.

        Args:
            file_path: Path of the text file to sample.
            max_snapshot_tokens: Target token budget for the snapshot.
            max_file_size: Files larger than this are skipped entirely.

        Returns:
            A populated ``SnapshotInfo``, or ``None`` if the file exceeds
            ``max_file_size``.
        """
        file_path = Path(file_path)
        file_size = file_path.stat().st_size
        if file_size > max_file_size:  # TODO: add more strategies for large files
            logger.warning(
                f"File size {file_size} exceeds maximum allowed size of {max_file_size} bytes, skipping snapshot."
            )
            return None

        snapshot_info = SnapshotInfo()

        selected_lines = []
        accumulated_tokens = 0
        line_count = 0

        # Stream through file once
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = self.filter_line(line)
                if not line:
                    continue

                line_count += 1
                line_tokens = self.tokenizer_util.count_tokens(contents=line.strip())

                # Adaptive sampling strategy:
                # - Early in file: higher acceptance probability
                # - Near token limit: lower probability
                # - Always accept if budget allows and line is small

                # Calculate acceptance probability
                token_fill_ratio = (
                    accumulated_tokens / max_snapshot_tokens
                    if max_snapshot_tokens > 0
                    else 0
                )
                line_token_ratio = (
                    line_tokens / max_snapshot_tokens if max_snapshot_tokens > 0 else 0
                )

                # Base probability decreases as we fill token budget
                base_prob = max(0.2, 1.0 - token_fill_ratio * 2.0)

                # Adjust for line size (prefer smaller lines when budget is tight)
                size_penalty = (
                    min(1.0, 0.8 + (1.0 - line_token_ratio) * 0.4)
                    if token_fill_ratio > 0.5
                    else 1.0
                )

                # Add some randomness to avoid deterministic patterns
                noise = random.uniform(0.8, 1.2)
                acceptance_prob = min(1.0, base_prob * size_penalty * noise)

                # Force accept conditions
                if (accumulated_tokens == 0 and line_tokens > 0) or (
                    accumulated_tokens + line_tokens <= max_snapshot_tokens * 0.7
                ):
                    acceptance_prob = 1.0

                # Sampling decision
                if random.random() < acceptance_prob:
                    # Check if adding this line keeps us within reasonable bounds
                    if (
                        accumulated_tokens + line_tokens <= max_snapshot_tokens * 1.5
                    ):  # Allow 50% overflow max
                        selected_lines.append((line, line_count))
                        accumulated_tokens += line_tokens

                # Early termination if we've significantly exceeded target
                if accumulated_tokens >= max_snapshot_tokens * 1.2:
                    break

        # Sort by original line number to preserve document structure
        selected_lines.sort(key=lambda x: x[1])
        logger.info(
            f"Got {len(selected_lines)} selected lines, total tokens: {accumulated_tokens}"
        )

        snapshot_info.contents = [line for line, _ in selected_lines]

        # Get keywords/key phrase
        try:
            snapshot_info.keywords = self.text_insights.extract_phrase(
                contents=snapshot_info.contents
            )
        except Exception as e:
            logger.error(f"Error extracting keywords: {e}")
            snapshot_info.keywords = []

        try:
            # TODO: fold into the same LLM call as the phrase extraction
            snapshot_info.description = self.text_insights.extract_toc(
                contents=snapshot_info.contents
            )
        except Exception as e:
            logger.error(f"Error extracting description: {e}")
            snapshot_info.description = ""

        return snapshot_info
|