isage_middleware-0.2.4.3-cp311-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. isage_middleware-0.2.4.3.dist-info/METADATA +266 -0
  2. isage_middleware-0.2.4.3.dist-info/RECORD +94 -0
  3. isage_middleware-0.2.4.3.dist-info/WHEEL +5 -0
  4. isage_middleware-0.2.4.3.dist-info/top_level.txt +1 -0
  5. sage/middleware/__init__.py +59 -0
  6. sage/middleware/_version.py +6 -0
  7. sage/middleware/components/__init__.py +30 -0
  8. sage/middleware/components/extensions_compat.py +141 -0
  9. sage/middleware/components/sage_db/__init__.py +116 -0
  10. sage/middleware/components/sage_db/backend.py +136 -0
  11. sage/middleware/components/sage_db/service.py +15 -0
  12. sage/middleware/components/sage_flow/__init__.py +76 -0
  13. sage/middleware/components/sage_flow/python/__init__.py +14 -0
  14. sage/middleware/components/sage_flow/python/micro_service/__init__.py +4 -0
  15. sage/middleware/components/sage_flow/python/micro_service/sage_flow_service.py +88 -0
  16. sage/middleware/components/sage_flow/python/sage_flow.py +30 -0
  17. sage/middleware/components/sage_flow/service.py +14 -0
  18. sage/middleware/components/sage_mem/__init__.py +83 -0
  19. sage/middleware/components/sage_sias/__init__.py +59 -0
  20. sage/middleware/components/sage_sias/continual_learner.py +184 -0
  21. sage/middleware/components/sage_sias/coreset_selector.py +302 -0
  22. sage/middleware/components/sage_sias/types.py +94 -0
  23. sage/middleware/components/sage_tsdb/__init__.py +81 -0
  24. sage/middleware/components/sage_tsdb/python/__init__.py +21 -0
  25. sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi +17 -0
  26. sage/middleware/components/sage_tsdb/python/algorithms/__init__.py +17 -0
  27. sage/middleware/components/sage_tsdb/python/algorithms/base.py +51 -0
  28. sage/middleware/components/sage_tsdb/python/algorithms/out_of_order_join.py +248 -0
  29. sage/middleware/components/sage_tsdb/python/algorithms/window_aggregator.py +296 -0
  30. sage/middleware/components/sage_tsdb/python/micro_service/__init__.py +7 -0
  31. sage/middleware/components/sage_tsdb/python/micro_service/sage_tsdb_service.py +365 -0
  32. sage/middleware/components/sage_tsdb/python/sage_tsdb.py +523 -0
  33. sage/middleware/components/sage_tsdb/service.py +17 -0
  34. sage/middleware/components/vector_stores/__init__.py +25 -0
  35. sage/middleware/components/vector_stores/chroma.py +483 -0
  36. sage/middleware/components/vector_stores/chroma_adapter.py +185 -0
  37. sage/middleware/components/vector_stores/milvus.py +677 -0
  38. sage/middleware/operators/__init__.py +56 -0
  39. sage/middleware/operators/agent/__init__.py +24 -0
  40. sage/middleware/operators/agent/planning/__init__.py +5 -0
  41. sage/middleware/operators/agent/planning/llm_adapter.py +41 -0
  42. sage/middleware/operators/agent/planning/planner_adapter.py +98 -0
  43. sage/middleware/operators/agent/planning/router.py +107 -0
  44. sage/middleware/operators/agent/runtime.py +296 -0
  45. sage/middleware/operators/agentic/__init__.py +41 -0
  46. sage/middleware/operators/agentic/config.py +254 -0
  47. sage/middleware/operators/agentic/planning_operator.py +125 -0
  48. sage/middleware/operators/agentic/refined_searcher.py +132 -0
  49. sage/middleware/operators/agentic/runtime.py +241 -0
  50. sage/middleware/operators/agentic/timing_operator.py +125 -0
  51. sage/middleware/operators/agentic/tool_selection_operator.py +127 -0
  52. sage/middleware/operators/context/__init__.py +17 -0
  53. sage/middleware/operators/context/critic_evaluation.py +16 -0
  54. sage/middleware/operators/context/model_context.py +565 -0
  55. sage/middleware/operators/context/quality_label.py +12 -0
  56. sage/middleware/operators/context/search_query_results.py +61 -0
  57. sage/middleware/operators/context/search_result.py +42 -0
  58. sage/middleware/operators/context/search_session.py +79 -0
  59. sage/middleware/operators/filters/__init__.py +26 -0
  60. sage/middleware/operators/filters/context_sink.py +387 -0
  61. sage/middleware/operators/filters/context_source.py +376 -0
  62. sage/middleware/operators/filters/evaluate_filter.py +83 -0
  63. sage/middleware/operators/filters/tool_filter.py +74 -0
  64. sage/middleware/operators/llm/__init__.py +18 -0
  65. sage/middleware/operators/llm/sagellm_generator.py +432 -0
  66. sage/middleware/operators/rag/__init__.py +147 -0
  67. sage/middleware/operators/rag/arxiv.py +331 -0
  68. sage/middleware/operators/rag/chunk.py +13 -0
  69. sage/middleware/operators/rag/document_loaders.py +23 -0
  70. sage/middleware/operators/rag/evaluate.py +658 -0
  71. sage/middleware/operators/rag/generator.py +340 -0
  72. sage/middleware/operators/rag/index_builder/__init__.py +48 -0
  73. sage/middleware/operators/rag/index_builder/builder.py +363 -0
  74. sage/middleware/operators/rag/index_builder/manifest.py +101 -0
  75. sage/middleware/operators/rag/index_builder/storage.py +131 -0
  76. sage/middleware/operators/rag/pipeline.py +46 -0
  77. sage/middleware/operators/rag/profiler.py +59 -0
  78. sage/middleware/operators/rag/promptor.py +400 -0
  79. sage/middleware/operators/rag/refiner.py +231 -0
  80. sage/middleware/operators/rag/reranker.py +364 -0
  81. sage/middleware/operators/rag/retriever.py +1308 -0
  82. sage/middleware/operators/rag/searcher.py +37 -0
  83. sage/middleware/operators/rag/types.py +28 -0
  84. sage/middleware/operators/rag/writer.py +80 -0
  85. sage/middleware/operators/tools/__init__.py +71 -0
  86. sage/middleware/operators/tools/arxiv_paper_searcher.py +175 -0
  87. sage/middleware/operators/tools/arxiv_searcher.py +102 -0
  88. sage/middleware/operators/tools/duckduckgo_searcher.py +105 -0
  89. sage/middleware/operators/tools/image_captioner.py +104 -0
  90. sage/middleware/operators/tools/nature_news_fetcher.py +224 -0
  91. sage/middleware/operators/tools/searcher_tool.py +514 -0
  92. sage/middleware/operators/tools/text_detector.py +185 -0
  93. sage/middleware/operators/tools/url_text_extractor.py +104 -0
  94. sage/middleware/py.typed +2 -0
sage/middleware/operators/rag/arxiv.py
@@ -0,0 +1,331 @@
+ import json
+ import os
+ import re
+ import time
+ from collections import Counter
+ from urllib.parse import quote
+
+ import feedparser
+ import requests
+
+ from sage.common.core.functions import MapFunction as MapOperator
+
+ # PyMuPDF (fitz) is required for PDF processing
+ try:
+     import fitz  # type: ignore[import-not-found]
+
+     FITZ_AVAILABLE = True
+ except ImportError:
+     FITZ_AVAILABLE = False
+     fitz = None  # type: ignore[assignment]
+
+
+ class Paper:
+     def __init__(self, path, title="", url="", abs="", authors=None, **kwargs):
+         if authors is None:
+             authors = []
+         super().__init__(**kwargs)
+
+         # Check if fitz is available
+         if not FITZ_AVAILABLE or fitz is None:
+             raise RuntimeError(
+                 "PyMuPDF (fitz) is required for PDF processing. Install with: pip install PyMuPDF"
+             )
+
+         # Initialize the Paper object from a PDF path
+         self.url = url  # link to the paper
+         self.path = path  # path to the PDF file
+         self.section_names = []  # section headings
+         self.section_texts = {}  # section contents
+         self.abs = abs
+         self.title_page = 0
+         if title == "":
+             self.pdf = fitz.open(self.path)  # the PDF document  # type: ignore[attr-defined]
+             self.title = self.get_title()
+             self.parse_pdf()
+         else:
+             self.title = title
+         self.authors = authors
+         self.roman_num = [
+             "I",
+             "II",
+             "III",
+             "IV",
+             "V",
+             "VI",
+             "VII",
+             "VIII",
+             "IIX",
+             "IX",
+             "X",
+         ]
+         self.digit_num = [str(d + 1) for d in range(10)]
+         self.first_image = ""
+
+     def parse_pdf(self):
+         assert fitz is not None, "fitz must be available"
+         self.pdf = fitz.open(self.path)  # type: ignore[attr-defined]
+         self.text_list = [page.get_text() for page in self.pdf]
+         self.all_text = " ".join(self.text_list)
+         self.extract_section_infomation()
+         self.section_texts.update({"title": self.title})
+         self.pdf.close()
+
+     # Identify each chapter heading and return them as a list
+     def get_chapter_names(self):
+         assert fitz is not None, "fitz must be available"
+         # Open the PDF file
+         doc = fitz.open(self.path)  # type: ignore[attr-defined]
+         text_list = [page.get_text() for page in doc]
+         all_text = ""
+         for text in text_list:
+             all_text += text
+         # Collect candidate chapter names
+         chapter_names = []
+         for line in all_text.split("\n"):
+             if "." in line:
+                 point_split_list = line.split(".")
+                 space_split_list = line.split(" ")
+                 if 1 < len(space_split_list) < 5:
+                     if 1 < len(point_split_list) < 5 and (
+                         point_split_list[0] in self.roman_num
+                         or point_split_list[0] in self.digit_num
+                     ):
+                         # print("line:", line)
+                         chapter_names.append(line)
+
+         return chapter_names
+
+     def get_title(self):
+         doc = self.pdf  # the already-opened PDF document
+         max_font_size = 0  # largest font size seen so far
+         max_font_sizes = [0]
+         for page_index, page in enumerate(doc):  # iterate over pages
+             text = page.get_text("dict")  # text info for the page
+             blocks = text["blocks"]  # list of text blocks
+             for block in blocks:  # iterate over text blocks
+                 if block["type"] == 0 and len(block["lines"]):  # text-type block
+                     if len(block["lines"][0]["spans"]):
+                         font_size = block["lines"][0]["spans"][0][
+                             "size"
+                         ]  # font size of the first span on the first line
+                         max_font_sizes.append(font_size)
+                         if font_size > max_font_size:  # new maximum font size
+                             max_font_size = font_size  # update the maximum
+         max_font_sizes.sort()
+         # print("max_font_sizes", max_font_sizes[-10:])
+         cur_title = ""
+         for page_index, page in enumerate(doc):  # iterate over pages
+             text = page.get_text("dict")  # text info for the page
+             blocks = text["blocks"]  # list of text blocks
+             for block in blocks:  # iterate over text blocks
+                 if block["type"] == 0 and len(block["lines"]):  # text-type block
+                     if len(block["lines"][0]["spans"]):
+                         cur_string = block["lines"][0]["spans"][0][
+                             "text"
+                         ]  # text of the first span on the first line
+                         font_size = block["lines"][0]["spans"][0][
+                             "size"
+                         ]  # font size of the first span on the first line
+                         # print(font_size)
+                         if (
+                             abs(font_size - max_font_sizes[-1]) < 0.3
+                             or abs(font_size - max_font_sizes[-2]) < 0.3
+                         ):
+                             # treat spans set in (nearly) the largest font as title text
+                             if len(cur_string) > 4 and "arXiv" not in cur_string:
+                                 if cur_title == "":
+                                     cur_title += cur_string
+                                 else:
+                                     cur_title += " " + cur_string
+                                 self.title_page = page_index
+                                 # break
+         title = cur_title.replace("\n", " ")
+         return title
+
+     def extract_section_infomation(self):
+         assert fitz is not None, "fitz must be available"
+         doc = fitz.open(self.path)  # type: ignore[attr-defined]
+
+         # Collect every font size that appears in the document
+         font_sizes = []
+         for page in doc:
+             blocks = page.get_text("dict")["blocks"]
+             for block in blocks:
+                 if "lines" not in block:
+                     continue
+                 lines = block["lines"]
+                 for line in lines:
+                     for span in line["spans"]:
+                         font_sizes.append(span["size"])
+         most_common_size, _ = Counter(font_sizes).most_common(1)[0]
+
+         # Use the most frequent font size as the threshold for heading detection
+         threshold = most_common_size * 1
+         section_dict = {}
+         section_dict["Abstract"] = ""
+         last_heading = None
+         subheadings = []
+         heading_font = -1
+         # Walk every page looking for subheadings
+         found_abstract = False
+         upper_heading = False
+         font_heading = False
+         for page in doc:
+             blocks = page.get_text("dict")["blocks"]
+             for block in blocks:
+                 if not found_abstract:
+                     try:
+                         text = json.dumps(block)
+                     except Exception:
+                         continue
+                     if re.search(r"\bAbstract\b", text, re.IGNORECASE):
+                         found_abstract = True
+                         last_heading = "Abstract"
+                 if found_abstract:
+                     if "lines" not in block:
+                         continue
+                     lines = block["lines"]
+                     for line in lines:
+                         for span in line["spans"]:
+                             # If the current span looks like a subheading
+                             if (
+                                 not font_heading
+                                 and span["text"].isupper()
+                                 and sum(
+                                     1 for c in span["text"] if c.isupper() and ("A" <= c <= "Z")
+                                 )
+                                 > 4
+                             ):  # papers whose headings use the body font size but are set in all caps
+                                 upper_heading = True
+                                 heading = span["text"].strip()
+                                 if "References" in heading:  # ignore everything after References
+                                     self.section_names = subheadings
+                                     self.section_texts = section_dict
+                                     return
+                                 subheadings.append(heading)
+                                 if last_heading is not None:
+                                     section_dict[last_heading] = section_dict[last_heading].strip()
+                                 section_dict[heading] = ""
+                                 last_heading = heading
+                             if (
+                                 not upper_heading
+                                 and span["size"] > threshold
+                                 and re.match(  # the normal case: detect headings by font size
+                                     r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
+                                     span["text"].strip(),
+                                 )
+                             ):
+                                 font_heading = True
+                                 if heading_font == -1:
+                                     heading_font = span["size"]
+                                 elif heading_font != span["size"]:
+                                     continue
+                                 heading = span["text"].strip()
+                                 if "References" in heading:  # ignore everything after References
+                                     self.section_names = subheadings
+                                     self.section_texts = section_dict
+                                     return
+                                 subheadings.append(heading)
+                                 if last_heading is not None:
+                                     section_dict[last_heading] = section_dict[last_heading].strip()
+                                 section_dict[heading] = ""
+                                 last_heading = heading
+                             # Otherwise append the span text to the current section
+                             elif last_heading is not None:
+                                 section_dict[last_heading] += " " + span["text"].strip()
+         self.section_names = subheadings
+         self.section_texts = section_dict
+
+
+ class ArxivPDFDownloader(MapOperator):
+     def __init__(self, config):
+         super().__init__()
+         config = config["ArxivPDFDownloader"]
+         self.max_results = config.get("max_results", 5)
+         self.save_dir = config.get("save_dir", "arxiv_pdfs")
+         os.makedirs(self.save_dir, exist_ok=True)
+
+     def execute(self, data: str) -> list[str]:
+         self.query = data
+         base_url = "http://export.arxiv.org/api/query?"
+         encoded_query = quote(self.query)
+         query = f"search_query={encoded_query}&start=0&max_results={self.max_results}&sortBy=submittedDate&sortOrder=descending"
+         url = base_url + query
+         feed = feedparser.parse(url)
+
+         pdf_paths = []
+
+         print(feed)
+         for entry in feed.entries:
+             # feedparser's type hints are incomplete, entry.id is actually a string
+             arxiv_id = entry.id.split("/abs/")[-1]  # type: ignore[union-attr]
+             pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+             pdf_path = os.path.join(self.save_dir, f"{arxiv_id}.pdf")
+
+             if not os.path.exists(pdf_path):
+                 try:
+                     resp = requests.get(pdf_url, timeout=15)
+                     if resp.status_code == 200:
+                         with open(pdf_path, "wb") as f:
+                             f.write(resp.content)
+                         pdf_paths.append(pdf_path)
+                         self.logger.info(f"Downloaded: {pdf_path}")
+                     else:
+                         self.logger.error(f"HTTP {resp.status_code} for {pdf_url}")
+                 except Exception as e:
+                     self.logger.error(f"Failed to download {pdf_url}: {e}")
+             else:
+                 self.logger.info(f"File already exists: {pdf_path}")
+                 pdf_paths.append(pdf_path)
+
+             time.sleep(1)  # throttle requests to the arXiv API
+
+         return pdf_paths
+
+
+ class ArxivPDFParser(MapOperator):
+     def __init__(self, config):
+         super().__init__()
+         config = config["ArxivPDFParser"]
+         print(config)
+         self.output_dir = config.get("output_dir", "arxiv_structured_json")
+         os.makedirs(self.output_dir, exist_ok=True)
+
+     def execute(self, data: list[str]) -> list[str]:
+         pdf_paths = data
+         output_paths = []
+
+         for pdf_path in pdf_paths:
+             filename = os.path.basename(pdf_path).replace(".pdf", ".json")
+             json_path = os.path.join(self.output_dir, filename)
+
+             if not os.path.exists(json_path):
+                 try:
+                     paper = Paper(pdf_path)
+                     paper.parse_pdf()
+                     with open(json_path, "w", encoding="utf-8") as f:
+                         json.dump(
+                             {
+                                 "title": paper.title,
+                                 "authors": paper.authors,
+                                 "abs": paper.abs,
+                                 "sections": paper.section_texts,
+                             },
+                             f,
+                             ensure_ascii=False,
+                             indent=4,
+                         )
+                     output_paths.append(json_path)
+                     self.logger.info(f"Parsed and saved: {json_path}")
+                 except Exception as e:
+                     self.logger.error(f"Failed to parse {pdf_path}: {e}")
+             else:
+                 self.logger.info(f"JSON already exists: {json_path}")
+                 output_paths.append(json_path)
+
+         return output_paths
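
Taken together, the two operators form a small download-then-parse pipeline: ArxivPDFDownloader turns a query string into local PDF paths, and ArxivPDFParser turns those paths into structured JSON via the Paper class. A minimal sketch of wiring them up directly, assuming MapFunction can be instantiated standalone and provides self.logger (in a real SAGE pipeline they would instead be registered as map operators):

    # Hypothetical direct invocation; the config keys mirror those read in __init__ above.
    config = {
        "ArxivPDFDownloader": {"max_results": 3, "save_dir": "arxiv_pdfs"},
        "ArxivPDFParser": {"output_dir": "arxiv_structured_json"},
    }
    downloader = ArxivPDFDownloader(config)
    parser = ArxivPDFParser(config)

    pdf_paths = downloader.execute("retrieval augmented generation")  # query arXiv
    json_paths = parser.execute(pdf_paths)  # one structured JSON file per PDF
    print(json_paths)
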
sage/middleware/operators/rag/chunk.py
@@ -0,0 +1,13 @@
+ """Compatibility shim for RAG chunking operators.
+
+ The canonical implementations now live in ``sage.libs.rag.chunk``. This file
+ keeps the old import path available for middleware operators and third-party
+ code until the next minor release.
+ """
+
+ from sage.libs.rag.chunk import (  # noqa: F401
+     CharacterSplitter,
+     SentenceTransformersTokenTextSplitter,
+ )
+
+ __all__ = ["CharacterSplitter", "SentenceTransformersTokenTextSplitter"]
sage/middleware/operators/rag/document_loaders.py
@@ -0,0 +1,23 @@
+ """Compatibility shim for RAG document loaders.
+
+ The actual loader implementations have moved to ``sage.libs.rag.document_loaders``
+ so that lower layers can reuse them without depending on middleware.
+ """
+
+ from sage.libs.rag.document_loaders import (  # noqa: F401
+     DocLoader,
+     DocxLoader,
+     LoaderFactory,
+     MarkdownLoader,
+     PDFLoader,
+     TextLoader,
+ )
+
+ __all__ = [
+     "TextLoader",
+     "PDFLoader",
+     "DocxLoader",
+     "DocLoader",
+     "MarkdownLoader",
+     "LoaderFactory",
+ ]
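
The loader shim follows the same pattern: new code should import from the canonical module, while the legacy middleware path stays valid during the deprecation window. A hedged sketch of the preferred imports (only the import paths declared above are shown; loader constructor signatures live in sage.libs and are not reproduced here):

    # Preferred canonical import path going forward
    from sage.libs.rag.document_loaders import LoaderFactory, PDFLoader

    # Legacy path, kept working by this shim until the next minor release:
    # from sage.middleware.operators.rag.document_loaders import LoaderFactory, PDFLoader
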