pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. pyxllib/__init__.py +9 -2
  2. pyxllib/algo/__init__.py +8 -0
  3. pyxllib/algo/disjoint.py +54 -0
  4. pyxllib/algo/geo.py +541 -0
  5. pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
  6. pyxllib/algo/matcher.py +389 -0
  7. pyxllib/algo/newbie.py +166 -0
  8. pyxllib/algo/pupil.py +629 -0
  9. pyxllib/algo/shapelylib.py +67 -0
  10. pyxllib/algo/specialist.py +241 -0
  11. pyxllib/algo/stat.py +494 -0
  12. pyxllib/algo/treelib.py +149 -0
  13. pyxllib/algo/unitlib.py +66 -0
  14. pyxllib/autogui/__init__.py +5 -0
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/autogui/autogui.py +852 -0
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/autogui/virtualkey.py +102 -0
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +1 -11
  24. pyxllib/cv/expert.py +267 -0
  25. pyxllib/cv/{imlib.py → imfile.py} +18 -83
  26. pyxllib/cv/imhash.py +39 -0
  27. pyxllib/cv/pupil.py +9 -0
  28. pyxllib/cv/rgbfmt.py +1525 -0
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +163 -49
  31. pyxllib/cv/xlcvlib.py +1040 -0
  32. pyxllib/cv/xlpillib.py +423 -0
  33. pyxllib/data/__init__.py +0 -0
  34. pyxllib/data/echarts.py +240 -0
  35. pyxllib/data/jsonlib.py +89 -0
  36. pyxllib/{util/oss2_.py → data/oss.py} +11 -9
  37. pyxllib/data/pglib.py +1127 -0
  38. pyxllib/data/sqlite.py +568 -0
  39. pyxllib/{util → data}/sqllib.py +13 -31
  40. pyxllib/ext/JLineViewer.py +505 -0
  41. pyxllib/ext/__init__.py +6 -0
  42. pyxllib/{util → ext}/demolib.py +119 -35
  43. pyxllib/ext/drissionlib.py +277 -0
  44. pyxllib/ext/kq5034lib.py +12 -0
  45. pyxllib/{util/main.py → ext/old.py} +122 -284
  46. pyxllib/ext/qt.py +449 -0
  47. pyxllib/ext/robustprocfile.py +497 -0
  48. pyxllib/ext/seleniumlib.py +76 -0
  49. pyxllib/{util/tklib.py → ext/tk.py} +10 -11
  50. pyxllib/ext/unixlib.py +827 -0
  51. pyxllib/ext/utools.py +351 -0
  52. pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
  53. pyxllib/ext/win32lib.py +40 -0
  54. pyxllib/ext/wjxlib.py +88 -0
  55. pyxllib/ext/wpsapi.py +124 -0
  56. pyxllib/ext/xlwork.py +9 -0
  57. pyxllib/ext/yuquelib.py +1105 -0
  58. pyxllib/file/__init__.py +17 -0
  59. pyxllib/file/docxlib.py +761 -0
  60. pyxllib/{util → file}/gitlib.py +40 -27
  61. pyxllib/file/libreoffice.py +165 -0
  62. pyxllib/file/movielib.py +148 -0
  63. pyxllib/file/newbie.py +10 -0
  64. pyxllib/file/onenotelib.py +1469 -0
  65. pyxllib/file/packlib/__init__.py +330 -0
  66. pyxllib/{util → file/packlib}/zipfile.py +598 -195
  67. pyxllib/file/pdflib.py +426 -0
  68. pyxllib/file/pupil.py +185 -0
  69. pyxllib/file/specialist/__init__.py +685 -0
  70. pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
  71. pyxllib/file/specialist/download.py +193 -0
  72. pyxllib/file/specialist/filelib.py +2829 -0
  73. pyxllib/file/xlsxlib.py +3131 -0
  74. pyxllib/file/xlsyncfile.py +341 -0
  75. pyxllib/prog/__init__.py +5 -0
  76. pyxllib/prog/cachetools.py +64 -0
  77. pyxllib/prog/deprecatedlib.py +233 -0
  78. pyxllib/prog/filelock.py +42 -0
  79. pyxllib/prog/ipyexec.py +253 -0
  80. pyxllib/prog/multiprogs.py +940 -0
  81. pyxllib/prog/newbie.py +451 -0
  82. pyxllib/prog/pupil.py +1197 -0
  83. pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
  84. pyxllib/prog/specialist/__init__.py +391 -0
  85. pyxllib/prog/specialist/bc.py +203 -0
  86. pyxllib/prog/specialist/browser.py +497 -0
  87. pyxllib/prog/specialist/common.py +347 -0
  88. pyxllib/prog/specialist/datetime.py +199 -0
  89. pyxllib/prog/specialist/tictoc.py +240 -0
  90. pyxllib/prog/specialist/xllog.py +180 -0
  91. pyxllib/prog/xlosenv.py +108 -0
  92. pyxllib/stdlib/__init__.py +17 -0
  93. pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
  94. pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
  95. pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
  96. pyxllib/text/__init__.py +8 -0
  97. pyxllib/text/ahocorasick.py +39 -0
  98. pyxllib/text/airscript.js +744 -0
  99. pyxllib/text/charclasslib.py +121 -0
  100. pyxllib/text/jiebalib.py +267 -0
  101. pyxllib/text/jinjalib.py +32 -0
  102. pyxllib/text/jsa_ai_prompt.md +271 -0
  103. pyxllib/text/jscode.py +922 -0
  104. pyxllib/text/latex/__init__.py +158 -0
  105. pyxllib/text/levenshtein.py +303 -0
  106. pyxllib/text/nestenv.py +1215 -0
  107. pyxllib/text/newbie.py +300 -0
  108. pyxllib/text/pupil/__init__.py +8 -0
  109. pyxllib/text/pupil/common.py +1121 -0
  110. pyxllib/text/pupil/xlalign.py +326 -0
  111. pyxllib/text/pycode.py +47 -0
  112. pyxllib/text/specialist/__init__.py +8 -0
  113. pyxllib/text/specialist/common.py +112 -0
  114. pyxllib/text/specialist/ptag.py +186 -0
  115. pyxllib/text/spellchecker.py +172 -0
  116. pyxllib/text/templates/echart_base.html +11 -0
  117. pyxllib/text/templates/highlight_code.html +17 -0
  118. pyxllib/text/templates/latex_editor.html +103 -0
  119. pyxllib/text/vbacode.py +17 -0
  120. pyxllib/text/xmllib.py +747 -0
  121. pyxllib/xl.py +39 -0
  122. pyxllib/xlcv.py +17 -0
  123. pyxllib-0.3.197.dist-info/METADATA +48 -0
  124. pyxllib-0.3.197.dist-info/RECORD +126 -0
  125. {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
  126. pyxllib/basic/_1_strlib.py +0 -945
  127. pyxllib/basic/_2_timelib.py +0 -488
  128. pyxllib/basic/_3_pathlib.py +0 -916
  129. pyxllib/basic/_4_loglib.py +0 -419
  130. pyxllib/basic/__init__.py +0 -54
  131. pyxllib/basic/arrow_.py +0 -250
  132. pyxllib/basic/chardet_.py +0 -66
  133. pyxllib/basic/dirlib.py +0 -529
  134. pyxllib/basic/dprint.py +0 -202
  135. pyxllib/basic/extension.py +0 -12
  136. pyxllib/basic/judge.py +0 -31
  137. pyxllib/basic/log.py +0 -204
  138. pyxllib/basic/pathlib_.py +0 -705
  139. pyxllib/basic/pytictoc.py +0 -102
  140. pyxllib/basic/qiniu_.py +0 -61
  141. pyxllib/basic/strlib.py +0 -761
  142. pyxllib/basic/timer.py +0 -132
  143. pyxllib/cv/cv.py +0 -834
  144. pyxllib/cv/cvlib/_1_geo.py +0 -543
  145. pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
  146. pyxllib/cv/cvlib/_2_imgproc.py +0 -594
  147. pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
  148. pyxllib/cv/cvlib/_4_cvimg.py +0 -211
  149. pyxllib/cv/cvlib/__init__.py +0 -10
  150. pyxllib/cv/debugtools.py +0 -82
  151. pyxllib/cv/fitz_.py +0 -300
  152. pyxllib/cv/installer.py +0 -42
  153. pyxllib/debug/_0_installer.py +0 -38
  154. pyxllib/debug/_1_typelib.py +0 -277
  155. pyxllib/debug/_2_chrome.py +0 -198
  156. pyxllib/debug/_3_showdir.py +0 -161
  157. pyxllib/debug/_4_bcompare.py +0 -140
  158. pyxllib/debug/__init__.py +0 -49
  159. pyxllib/debug/bcompare.py +0 -132
  160. pyxllib/debug/chrome.py +0 -198
  161. pyxllib/debug/installer.py +0 -38
  162. pyxllib/debug/showdir.py +0 -158
  163. pyxllib/debug/typelib.py +0 -278
  164. pyxllib/image/__init__.py +0 -12
  165. pyxllib/torch/__init__.py +0 -20
  166. pyxllib/torch/modellib.py +0 -37
  167. pyxllib/torch/trainlib.py +0 -344
  168. pyxllib/util/__init__.py +0 -20
  169. pyxllib/util/aip_.py +0 -141
  170. pyxllib/util/casiadb.py +0 -59
  171. pyxllib/util/excellib.py +0 -495
  172. pyxllib/util/filelib.py +0 -612
  173. pyxllib/util/jsondata.py +0 -27
  174. pyxllib/util/jsondata2.py +0 -92
  175. pyxllib/util/labelmelib.py +0 -139
  176. pyxllib/util/onepy/__init__.py +0 -29
  177. pyxllib/util/onepy/onepy.py +0 -574
  178. pyxllib/util/onepy/onmanager.py +0 -170
  179. pyxllib/util/pyautogui_.py +0 -219
  180. pyxllib/util/textlib.py +0 -1305
  181. pyxllib/util/unorder.py +0 -22
  182. pyxllib/util/xmllib.py +0 -639
  183. pyxllib-0.0.43.dist-info/METADATA +0 -39
  184. pyxllib-0.0.43.dist-info/RECORD +0 -80
  185. pyxllib-0.0.43.dist-info/top_level.txt +0 -1
  186. {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,1469 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # @Author : 陈坤泽
4
+ # @Email : 877362867@qq.com
5
+ # @Date : 2022/06/28 21:40
6
+
7
+ import datetime
8
+ import logging
9
+ import re
10
+ import time
11
+ import warnings
12
+ import os
13
+ from threading import Thread
14
+ from functools import reduce
15
+
16
+ import bs4
17
+ import pytz
18
+ from xml.etree import ElementTree
19
+ from humanfriendly import format_size
20
+ from anytree.importer import DictImporter
21
+ from anytree.exporter import DictExporter
22
+
23
# Filter out this kind of warning (bs4's MarkupResemblesLocatorWarning)
warnings.filterwarnings("ignore", category=bs4.MarkupResemblesLocatorWarning, module='bs4')

import win32com.client

# When pywin32's gencache is flagged read-only, mark it writable and rebuild it
if win32com.client.gencache.is_readonly:
    win32com.client.gencache.is_readonly = False
    win32com.client.gencache.Rebuild()
31
+
32
+ from pyxllib.prog.newbie import SingletonForEveryClass
33
+ from pyxllib.prog.pupil import Timeout
34
+ from pyxllib.prog.specialist import tqdm
35
+ from pyxllib.algo.treelib import Node, XlNode
36
+ from pyxllib.text.xmllib import BeautifulSoup, XlBs4Tag
37
+ from pyxllib.file.specialist import XlPath, get_etag
38
+
39
+ """
40
+ 参考了onepy的实现,做了重构。OnePy:Provides pythonic wrappers around OneNote COM interfaces
41
+ """
42
+
43
# XML namespace prefix (in "{uri}" form) prepended to tag names when matching hierarchy elements.
# ONProcess.__init__ overwrites this global with its own namespace value.
namespace = "{http://schemas.microsoft.com/office/onenote/2013/onenote}"

# Free-floating page nodes that are not yet bound to a parent node; used by the progress-bar worker thread
_free_page_nodes = []

# Location of the cache files
CACHE_DIR = XlPath.tempdir() / 'OneNote/SearchCache'
os.makedirs(CACHE_DIR, exist_ok=True)

# Cache of page parsing results, used to speed up parsing.
# NOTE(review): read_pkl presumably unpickles the file, which is only safe while the cache directory
# is trusted -- confirm.
_page_parsed_cache = {}
_page_parsed_cache_file = CACHE_DIR / 'page_parsed_cache_file.pkl'
if _page_parsed_cache_file.is_file():
    _page_parsed_cache = _page_parsed_cache_file.read_pkl()

# Used to read and save serialized node data
importer = DictImporter()
exporter = DictExporter()
61
+
62
+
63
class ONProcess(metaclass=SingletonForEveryClass):
    """ Low-level win32 COM interface to OneNote.

    Every public method is a snake_case wrapper around the same-named (CamelCase) method of the
    OneNote COM application object. Details of each call are in the official documentation:
        Application interface (OneNote) | Microsoft Docs:
        https://docs.microsoft.com/en-us/office/client-developer/onenote/application-interface-onenote

    Apart from get_hierarchy and get_page_content (which let exceptions propagate), the wrappers are
    best-effort: when the COM call fails, the failure and the exception that caused it are printed
    and the wrapper returns None.
    """

    def __init__(self, timeout=30):
        """ Start the COM object and remember the per-page read timeout.

        :param timeout: time limit, in seconds, for reading a single page.
            It was originally meant to be 5 seconds, but some pages are very long and need more
            time; pages taking about half a minute were seen later, hence 30 seconds.
        """
        # TODO make this adapt automatically to the installed OneNote version instead of asking the
        #   user for a version: a user-supplied version would lead to several instantiated objects,
        #   which causes all kinds of problems with get_xml.
        #   OneNote 2016 is supported; behaviour with other versions is unknown.
        # The upstream onepy implementation picked ProgID/namespace by version number:
        #   16 -> 'OneNote.Application',    .../office/onenote/2013/onenote
        #   15 -> 'OneNote.Application.15', .../office/onenote/2013/onenote
        #   14 -> 'OneNote.Application.14', .../office/onenote/2010/onenote
        # (EnsureDispatch could fail there with pywintypes.com_error -2147221005, invalid class string.)
        # Alternative kept for reference: win32com.client.gencache.EnsureDispatch('OneNote.Application')
        self.process = win32com.client.DispatchEx('OneNote.Application')
        self.namespace = "{http://schemas.microsoft.com/office/onenote/2013/onenote}"

        # Publish the namespace to the module-level global read by the hierarchy classes
        global namespace
        namespace = self.namespace

        self.timeout_seconds = timeout

    @staticmethod
    def _report_failure(message, error):
        """ Print a best-effort failure together with the exception, so the cause is not lost. """
        print(f'{message}: {error!r}')

    def get_hierarchy(self, start_node_id="", hierarchy_scope=4):
        """ Return the result of the COM GetHierarchy call; exceptions propagate.

        HierarchyScope
        0 - Gets just the start node specified and no descendants.
        1 - Gets the immediate child nodes of the start node, and no descendants in higher or lower subsection groups.
        2 - Gets all notebooks below the start node, or root.
        3 - Gets all sections below the start node, including sections in section groups and subsection groups.
        4 - Gets all pages below the start node, including all pages in section groups and subsection groups.
        """
        return self.process.GetHierarchy(start_node_id, hierarchy_scope)

    def update_hierarchy(self, changes_xml_in):
        """ Best-effort UpdateHierarchy; always returns None. """
        try:
            self.process.UpdateHierarchy(changes_xml_in)
        except Exception as e:
            self._report_failure("Could not Update Hierarchy", e)

    def open_hierarchy(self, path, relative_to_object_id, object_id, create_file_type=0):
        """ Best-effort OpenHierarchy; returns the COM result, or None on failure.

        `object_id` is accepted for interface compatibility but is not used: an empty string is
        passed in that argument position of the COM call.

        CreateFileType
        0 - Creates no new object.
        1 - Creates a notebook with the specified name at the specified location.
        2 - Creates a section group with the specified name at the specified location.
        3 - Creates a section with the specified name at the specified location.
        """
        try:
            return self.process.OpenHierarchy(path, relative_to_object_id, "", create_file_type)
        except Exception as e:
            self._report_failure("Could not Open Hierarchy", e)

    def delete_hierarchy(self, object_id, excpect_last_modified=""):
        """ Best-effort DeleteHierarchy; always returns None. """
        try:
            self.process.DeleteHierarchy(object_id, excpect_last_modified)
        except Exception as e:
            self._report_failure("Could not Delete Hierarchy", e)

    def create_new_page(self, section_id, new_page_style=0):
        """ Best-effort CreateNewPage.

        :return: whatever the COM call returns, or None on failure.
            NOTE(review): this is expected to be the ID of the new page (the call's
            out-parameter) -- confirm against the installed OneNote version.

        NewPageStyle
        0 - Create a page that has the default page style
        1 - Create a blank page that has a title
        2 - Create a blank page that has no title
        """
        try:
            return self.process.CreateNewPage(section_id, "", new_page_style)
        except Exception as e:
            self._report_failure("Unable to create the page", e)

    def close_notebook(self, notebook_id):
        """ Best-effort CloseNotebook; always returns None. """
        try:
            self.process.CloseNotebook(notebook_id)
        except Exception as e:
            self._report_failure("Could not Close Notebook", e)

    def get_page_content(self, page_id, page_info=0):
        """ Return the result of GetPageContent, run inside Timeout(self.timeout_seconds).

        Exceptions (including whatever Timeout raises) propagate to the caller.

        PageInfo
        0 - Returns only file page content, without selection markup and binary data objects. This is the standard value to pass.
        1 - Returns page content with no selection markup, but with all binary data.
        2 - Returns page content with selection markup, but no binary data.
        3 - Returns page content with selection markup and all binary data.
        """
        with Timeout(self.timeout_seconds):
            return self.process.GetPageContent(page_id, "", page_info)

    def update_page_content(self, page_changes_xml_in, excpect_last_modified=0):
        """ Best-effort UpdatePageContent; always returns None. """
        try:
            self.process.UpdatePageContent(page_changes_xml_in, excpect_last_modified)
        except Exception as e:
            self._report_failure("Could not Update Page Content", e)

    def get_binary_page_content(self, page_id, callback_id):
        """ Best-effort GetBinaryPageContent; returns the COM result, or None on failure. """
        try:
            return self.process.GetBinaryPageContent(page_id, callback_id)
        except Exception as e:
            self._report_failure("Could not Get Binary Page Content", e)

    def delete_page_content(self, page_id, object_id, excpect_last_modified=0):
        """ Best-effort DeletePageContent; always returns None. """
        try:
            self.process.DeletePageContent(page_id, object_id, excpect_last_modified)
        except Exception as e:
            self._report_failure("Could not Delete Page Content", e)

    # Actions

    def navigate_to(self, page_id, object_id='', new_window=False):
        """ Best-effort NavigateTo; always returns None. """
        try:
            self.process.NavigateTo(page_id, object_id, new_window)
        except Exception as e:
            self._report_failure("Could not Navigate To", e)

    def publish(self, hierarchy_id, target_file_path, publish_format, clsid_of_exporter=""):
        """ Best-effort Publish; always returns None.

        PublishFormat
        0 - Published page is in .one format.
        1 - Published page is in .onea format.
        2 - Published page is in .mht format.
        3 - Published page is in .pdf format.
        4 - Published page is in .xps format.
        5 - Published page is in .doc or .docx format.
        6 - Published page is in enhanced metafile (.emf) format.
        """
        try:
            self.process.Publish(hierarchy_id, target_file_path, publish_format, clsid_of_exporter)
        except Exception as e:
            self._report_failure("Could not Publish", e)

    def open_package(self, path_package, path_dest):
        """ Best-effort OpenPackage; returns the COM result, or None on failure. """
        try:
            return self.process.OpenPackage(path_package, path_dest)
        except Exception as e:
            self._report_failure("Could not Open Package", e)

    def get_hyperlink_to_object(self, page_id, object_id=""):
        """ Best-effort GetHyperlinkToObject; returns the COM result, or None on failure.

        :param str page_id:
            The OneNote ID for the notebook, section group, section, or page for which you want a hyperlink.
        :param str object_id: The OneNote ID for the object within the page for which you want a hyperlink.
        """
        try:
            return self.process.GetHyperlinkToObject(page_id, object_id)
        except Exception as e:
            self._report_failure("Could not Get Hyperlink", e)

    def find_pages(self, start_node_id, search_string, display):
        """ Best-effort FindPages; returns the COM result, or None on failure. """
        try:
            return self.process.FindPages(start_node_id, search_string, "", False, display)
        except Exception as e:
            self._report_failure("Could not Find Pages", e)

    def get_special_location(self, special_location=0):
        """ Best-effort GetSpecialLocation; returns the COM result, or None on failure.

        SpecialLocation
        0 - Gets the path to the Backup Folders folder location.
        1 - Gets the path to the Unfiled Notes folder location.
        2 - Gets the path to the Default Notebook folder location.
        """
        try:
            return self.process.GetSpecialLocation(special_location)
        except Exception as e:
            self._report_failure("Could not retreive special location", e)
250
+
251
+
252
+ class _CommonMethods:
253
+ """ 笔记本、分区组、分区、页面 共有的一些成员方法 """
254
+
255
+ def init_node(self):
256
+ node = Node(self.name, _category=type(self).__name__)
257
+ # if type(self).__name__ != 'Page':
258
+ node._html_content = f'<a href="onenote/linkid?id={self.id}" target="_blank">{self.name}</a>'
259
+ return node
260
+
261
+ @property
262
+ def ancestors(self):
263
+ """ 获得所有父结点 """
264
+ parents = []
265
+ p = self
266
+ while getattr(p, 'parent', False):
267
+ parents.append(p.parent)
268
+ p = p.parent
269
+ return reversed(parents)
270
+
271
+ @property
272
+ def abspath_name(self):
273
+ names = [x.name for x in self.ancestors]
274
+ names.append(self.name)
275
+ return '/'.join(names)
276
+
277
+ def get_page_num(self):
278
+ return sum([x.get_page_num() for x in self._children])
279
+
280
+ def get_search_tree(self, *, print_mode=True, use_node_cache=True, reparse=False):
281
+ """ 获得检索树的根节点
282
+
283
+ :param print_mode: 输出所包含页面解析进度
284
+ :param use_node_cache: 检查xml是否有更新
285
+ 默认可以不检查,如果一个widget有_node结点可以直接使用
286
+ :param reparse: 使用旧的解析过xml的持久化文件数据
287
+ 这个好处是速度快,坏处是如果解析代码功能有更新,这样提取到的是错误的旧的解析数据
288
+ 这个缓存文件的处理规则,为了一些细节优化,也稍复杂~~
289
+ """
290
+ global _page_parsed_cache
291
+
292
+ # 1 进度条工具,开一个子线程,每秒监控页面解析进度
293
+ def timer_progress():
294
+ """ 为了实现这个进度条,还有点技术含量呢,用了子线程和一些trick """
295
+ global _free_page_nodes
296
+
297
+ def dfs(x):
298
+ cnt = 0
299
+
300
+ _category = getattr(x, '_category', '')
301
+ if _category == 'Page':
302
+ cnt += 1
303
+ elif _category == '':
304
+ return cnt
305
+
306
+ for y in x.children:
307
+ cnt += dfs(y)
308
+ return cnt
309
+
310
+ total = self.get_page_num()
311
+ _tqdm = tqdm(desc='OneNote解析页面', disable=False, total=total)
312
+ while not stop_flag:
313
+ num = dfs(self._node) # 只读取,不修改self._node,而且进度条稍微有些错没关系,所以不加锁
314
+ _free_page_nodes = list(filter(lambda x: x.root != self._node, _free_page_nodes))
315
+ num += len(_free_page_nodes)
316
+ _tqdm.n = num
317
+ _tqdm.refresh()
318
+
319
+ time.sleep(1)
320
+
321
+ # 最后统计一轮
322
+ num = dfs(self._node)
323
+ _free_page_nodes = []
324
+ _tqdm.total = _tqdm.n = num
325
+ _tqdm.refresh()
326
+
327
+ # print(f'一共{total}个页面,实际解析出{num}个页面')
328
+ # 实际解析成功的页面数
329
+ return num
330
+
331
+ # 2 主线程解析页面的过程,子线程每秒钟展示一次进度情况
332
+ if reparse and type(self).__name__ == 'OneNote': # 如果是OneNote层面reparse,直接重置整个缓存文件
333
+ _page_parsed_cache = {}
334
+ cache_num = len(_page_parsed_cache)
335
+
336
+ if print_mode:
337
+ stop_flag = False
338
+ timer_thread = Thread(target=timer_progress)
339
+ timer_thread.start()
340
+ root = self._search(use_node_cache=use_node_cache, reparse=reparse)
341
+ stop_flag = True # 使用进程里的共享变量,进行主线程和子线程之间的通信
342
+ timer_thread.join()
343
+ else:
344
+ root = self._search(use_node_cache=use_node_cache, reparse=reparse)
345
+
346
+ # 保存缓存文件
347
+ if cache_num != len(_page_parsed_cache) or reparse:
348
+ # 如果前后数量不一致,表示有更新内容,重新写入一份缓存文件。或者明确使用了reparse了也要保存。
349
+ _page_parsed_cache_file.write_pkl(_page_parsed_cache)
350
+
351
+ root.parent = None
352
+ return root
353
+
354
+ def search(self, pattern, child_depth=0, *,
355
+ edits=None, reparse=False,
356
+ print_mode=False, return_mode='text',
357
+ padding_mode=0, dedent=1, href_mode=1):
358
+ """ 查找内容
359
+
360
+ Page、Section、SectionGroup等实际所用的search方法
361
+
362
+ :param pattern:
363
+ text, 检索出现该关键词的node,并展示其相关的上下文内容
364
+ func, 可以输入自定义函数 check_node(node)->bool,True表示符合检索条件的node
365
+ re.compile,可以输入编译的正则模式,会使用re.search进行匹配
366
+ :param int child_depth: 对于检索到的node,向下展开几层子结点
367
+ -1,表示全部展开
368
+ 0,表示不展开子结点
369
+ 1,只展开直接子结点
370
+ ...
371
+ :param return_mode:
372
+ text,文本展示
373
+ html,网页富文本
374
+ :param href_mode: 超链接的模式
375
+ 0,不设超链接。text模式下强制设为0。
376
+ 1,静态链接(需要调用onenote生成,要多花一点点时间)
377
+ 2,动态链接,在开启server的时候使用才有意义
378
+ """
379
+ # 1 按照规则检索内容
380
+ if isinstance(pattern, str): # 文本关键词检索
381
+ def check_node(node):
382
+ # 纯文本部分
383
+ if pattern in node.name:
384
+ return True
385
+ # 如果纯文本找不到,也会在富文本格式里尝试匹配
386
+ html_text = getattr(node, '_html_content', '')
387
+ if html_text and pattern in html_text:
388
+ return True
389
+ return False
390
+
391
+ elif isinstance(pattern, re.Pattern): # 正则检索
392
+ def check_node(node):
393
+ if pattern.search(node.name):
394
+ return True
395
+ html_text = getattr(node, '_html_content', '')
396
+ if html_text and pattern.search(html_text):
397
+ return True
398
+ return False
399
+ else: # 自定义检索
400
+ check_node = pattern
401
+
402
+ # 2 更新索引并获得解析树
403
+ start_time = time.time()
404
+ edits = edits or []
405
+ edits = [list(name.split('/')) for name in edits]
406
+ # 更新易变数据的检索树
407
+ for path in edits:
408
+ node = self(path)
409
+ print(node.abspath_name)
410
+ node.get_search_tree(print_mode=False, use_node_cache=False, reparse=reparse)
411
+ root = self.get_search_tree(print_mode=print_mode, use_node_cache=True, reparse=reparse)
412
+ elapsed1 = time.time() - start_time
413
+
414
+ # 3 检索内容
415
+ start_time = time.time()
416
+ n = XlNode.sign_node(root, check_node, flag_name='_flag', child_depth=child_depth, reset_flag=True)
417
+ elapsed2 = time.time() - start_time
418
+
419
+ # 4 html情况下的渲染算法
420
+ def node_to_html(x, depth):
421
+ import html
422
+
423
+ if depth < 0:
424
+ return '<br/>'
425
+
426
+ content = f'{getattr(x, "_html_content", html.escape(x.name))}'
427
+ if not hasattr(x, '_category'):
428
+ pass
429
+ elif x._category == 'OE':
430
+ if href_mode == 1:
431
+ url = onenote.get_hyperlink_to_object(x._page_id, x._object_id)
432
+ content += f'&nbsp;<a href="{url}">go</a>'
433
+ elif href_mode == 2:
434
+ url = f"onenote/linkid?id={x._page_id}&object_id={x._object_id}"
435
+ content += f'&nbsp;<a href="{url}" target="_blank">go</a>'
436
+ elif x._category == 'Page':
437
+ color = ['#009900', '#00b300', '#00cc00'][x._page_level - 1]
438
+ content = f'<font color="{color}">{x.name}</font>'
439
+ if href_mode == 1:
440
+ url = onenote.get_hyperlink_to_object(x._page_id)
441
+ content = f'<a href="{url}">{content}</a>'
442
+ elif href_mode == 2:
443
+ url = f"onenote/linkid?id={x._page_id}"
444
+ content = f'<a href="{url}" target="_blank">{content}</a>'
445
+
446
+ content = content.replace('\n', ' ')
447
+
448
+ if padding_mode == 1:
449
+ div = f'<div>{"&nbsp;" * depth * 4}{content}</div>'
450
+ else:
451
+ div = f'<div style="padding-left:{depth * 2 + 1}em;text-indent:-1em">{content}</div>'
452
+
453
+ return div
454
+
455
+ # 5 展示内容
456
+ texts = [f'更新数据:{elapsed1:.2f}秒,内容检索:{elapsed2:.2f}秒,匹配条目数:{n}']
457
+ if return_mode == 'text':
458
+ body = XlNode.render(root, filter_=lambda x: getattr(x, '_flag', 0), dedent=dedent)
459
+ texts[0] += f',内容大小:{format_size(len(body.encode()), binary=True)}\n'
460
+ texts.append(body)
461
+ return '\n'.join(texts)
462
+ elif return_mode == 'html':
463
+ body = XlNode.render_html(root, node_to_html, filter_=lambda x: getattr(x, '_flag', 0), dedent=dedent)
464
+ texts[0] += f',内容大小:{format_size(len(body.encode()), binary=True)}<br/>'
465
+ texts.append(body)
466
+ return '<br/>'.join(texts)
467
+ else:
468
+ raise ValueError
469
+
470
+ def __call__(self, item=None):
471
+ """ 通过路径形式定位 """
472
+ if isinstance(item, str):
473
+ if '/' in item:
474
+ return reduce(lambda x, name: x[name], [self] + list(item.split('/')))
475
+ else:
476
+ return self[item]
477
+ else:
478
+ return self
479
+
480
+
481
class OneNote(ONProcess, _CommonMethods):
    """ The OneNote application; this is a singleton class.

    Note that OneNote, derived from ONProcess, is a singleton too,
    but ONProcess and OneNote produce two different objects.
    """

    def __init__(self, timeout=30):
        r"""
        :param timeout: per-page read timeout in seconds, forwarded to ONProcess

        If this error shows up: This COM object can not automate the makepy process - please run makepy manually for this object
        follow the procedure at the end of https://github.com/varunsrin/one-py :
        delete the 1.0 entry under HKEY_CLASSES_ROOT\TypeLib\{0EA692EE-BB50-4E3C-AEF0-356D91732725}
        (this KEY ID is said to be the same on every machine)
        """
        # trick: there are some singleton-related issues that require ONProcess to be initialised first
        super().__init__(timeout)

        self.xml = self.get_hierarchy("", 4)
        self.object_tree = ElementTree.fromstring(self.xml)
        self.hierarchy = Hierarchy(self.object_tree)
        self._children = list(self.hierarchy)
        self.name = 'onenote'
        # The first notebook's id doubles as the id of OneNote itself, so that features can be
        # designed uniformly; with no notebook open the id falls back to an empty string instead
        # of failing with IndexError.
        self.id = self._children[0].id if self._children else ''
        self._node = self.init_node()

    def get_href(self):
        """ Link of the first notebook (raises IndexError when there is no notebook). """
        # The OneNote application itself has no jump target, so it defaults to the first notebook
        return self._children[0].get_href()

    def init_node(self):
        """ Create the root tree node for the application. """
        node = Node(self.name, _category='OneNote', _html_content='<font color="purple">OneNote</font>')
        return node

    def get_page_content(self, page_id, page_info=0):
        """ Fetch a page's XML, parse it and wrap it in a PageContent.

        :param page_id: id of the page to read
        :param page_info: PageInfo value forwarded to ONProcess.get_page_content (default 0)
        """
        page_content_xml = ElementTree.fromstring(super(OneNote, self).get_page_content(page_id, page_info))
        return PageContent(page_content_xml)

    def update_page_content(self, page_changes_xml_in):
        """
        :param page_changes_xml_in:
            xml, may be the raw xml text, e.g.
                onenote.update_page_content(page.get_xml().replace('曹一众', '曹二众'))
            soup, a bs4 soup object may be passed as well; it is converted with str() before the call

        The timezone-aware timestamp passed as the expected last-modified value follows the recipe in
        How to debug win32com call in python:
            https://stackoverflow.com/questions/34904094/how-to-debug-win32com-call-in-python/34979646#34979646
        NOTE(review): 1899-12-30 presumably acts as a "no expected modification time" value -- confirm.
        """
        if not isinstance(page_changes_xml_in, str):
            # Non-str objects (e.g. a soup) are turned into their markup text before the COM call
            page_changes_xml_in = str(page_changes_xml_in)
        return super(OneNote, self).update_page_content(page_changes_xml_in,
                                                        pytz.utc.localize(datetime.datetime(1899, 12, 30)))

    def names(self):
        """ Names of all notebooks """
        ls = list(map(lambda x: x.name, self.hierarchy))
        return ls

    def nicknames(self):
        """ Nicknames of all notebooks """
        ls = list(map(lambda x: x.nickname, self.hierarchy))
        return ls

    def __getitem__(self, item):
        """ Get a notebook by index or by name (delegates to the hierarchy) """
        return self.hierarchy[item]

    def _search(self, *, use_node_cache=True, reparse=False):
        """ Build (or reuse) this object's tree node and attach every notebook's node to it. """
        if not self._node.is_leaf and use_node_cache:
            return self._node
        else:
            self._node = self.init_node()

        for x in self._children:
            cur_node = x._search(use_node_cache=use_node_cache, reparse=reparse)
            cur_node.parent = self._node

        return self._node
557
+
558
+
559
class Hierarchy:
    """ Container of the top-level notebooks: one Notebook per child element of the given XML. """

    def __init__(self, xml=None):
        self.xml = xml
        self._children = []
        if xml is None:
            return
        self.__deserialize_from_xml(xml)

    def __deserialize_from_xml(self, xml):
        """ Replace the children with a Notebook built from each child element of `xml`. """
        notebooks = []
        for element in xml:
            notebooks.append(Notebook(element))
        self._children = notebooks

    def __iter__(self):
        yield from self._children

    def __getitem__(self, item):
        """ Index the children by position (int) or by notebook nickname (str).

        Returns None for an unknown nickname or for any other key type.
        """
        if isinstance(item, int):
            return self._children[item]
        if not isinstance(item, str):
            return None
        return next((notebook for notebook in self if notebook.nickname == item), None)
583
+
584
+
585
class HierarchyNode:
    """ Attributes common to every element of the OneNote hierarchy. """

    # (attribute name, XML attribute key) pairs copied over by deserialize_from_xml
    _XML_FIELDS = (
        ("name", "name"),
        ("path", "path"),  # pages do not have this attribute, which is not an error (value is None)
        ("id", "ID"),
        ("last_modified_time", "lastModifiedTime"),
    )

    def __init__(self, parent=None):
        # `parent` is accepted but not stored here
        for attr in ("name", "path", "id", "last_modified_time", "synchronized"):
            setattr(self, attr, "")

    def deserialize_from_xml(self, xml):
        """ Copy name, path, ID and lastModifiedTime from `xml` (anything offering `.get(key)`). """
        for attr, key in self._XML_FIELDS:
            setattr(self, attr, xml.get(key))
599
+
600
+
601
class Notebook(HierarchyNode, _CommonMethods):
    """ A notebook: holds its sections and section groups, plus the recycle-bin section group. """

    def __init__(self, xml=None):
        """
        :param xml: the notebook's XML element; when None an empty notebook is created
        """
        self.xml = xml
        super().__init__(self)
        self.nickname = ""
        self.color = ""
        self.is_currently_viewed = ""
        self.recycleBin = None
        self._children = []
        if xml is not None:
            self.__deserialize_from_xml(xml)

        self._node = self.init_node()

    def get_href(self):
        """ 'onenote:' link built from the path with its last character removed. """
        return 'onenote:' + self.path[:-1]

    def init_node(self):
        """ Create a fresh tree node for this notebook. """
        node = Node(self.name, _category='Notebook',
                    _html_content=f'<a href="{self.get_href()}"><font color="red">《{self.name}》</font></a>')
        return node

    def __deserialize_from_xml(self, xml):
        """ Read the notebook attributes and build a child object for every section / section group. """
        HierarchyNode.deserialize_from_xml(self, xml)
        self.nickname = xml.get("nickname")
        self.color = xml.get("color")
        self.is_currently_viewed = xml.get("isCurrentlyViewed")
        self.recycleBin = None
        for node in xml:
            if node.tag == namespace + "Section":
                self._children.append(Section(node, self))

            elif node.tag == namespace + "SectionGroup":
                # The attribute is read as text: an explicit "false" must not count as the recycle
                # bin, so the value is interpreted as an XML boolean rather than by truthiness.
                is_recycle_bin = (node.get("isRecycleBin") or "").strip().lower() in ("true", "1")
                if is_recycle_bin:
                    self.recycleBin = SectionGroup(node, self)
                else:
                    self._children.append(SectionGroup(node, self))

    def __iter__(self):
        for c in self._children:
            yield c

    def __str__(self):
        return self.name

    def __getitem__(self, item):
        """ Index the children by position (int) or by name (str); None when nothing matches. """
        if isinstance(item, int):
            return self._children[item]
        elif isinstance(item, str):
            for nb in self:
                if nb.name == item:
                    return nb
        return None

    def _search(self, *, use_node_cache=True, reparse=False):
        """ Build (or reuse) this notebook's tree node and attach every child's node to it. """
        if not self._node.is_leaf and use_node_cache:
            return self._node
        else:
            self._node = self.init_node()

        for x in self._children:
            cur_node = x._search(use_node_cache=use_node_cache, reparse=reparse)
            cur_node.parent = self._node

        return self._node
668
+
669
+
670
class SectionGroup(HierarchyNode, _CommonMethods):
    """ Section group: may contain sections and nested section groups """

    def __init__(self, xml=None, parent_node=None):
        self.xml = xml
        super().__init__(self)
        self.is_recycle_Bin = False
        self._children = []
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

        self._node = self.init_node()

    def get_href(self):
        """ 'onenote:' link built from the path with its last character removed. """
        trimmed_path = self.path[:-1]
        return 'onenote:' + trimmed_path

    def init_node(self):
        """ Create a fresh tree node for this section group. """
        link = f'<a href="{self.get_href()}"><font color="#e68a00">〖{self.name}〗</font></a>'
        return Node(self.name, _category='SectionGroup', _html_content=link)

    def __iter__(self):
        # ckz: iteration follows the left-to-right order shown in OneNote:
        #   first all sections, then all section groups
        yield from self._children

    def __str__(self):
        return self.name

    def __deserialize_from_xml(self, xml):
        """ Read the group attributes and build a child object per nested group / section. """
        HierarchyNode.deserialize_from_xml(self, xml)
        self.is_recycle_Bin = xml.get("isRecycleBin")
        group_tag = namespace + "SectionGroup"
        section_tag = namespace + "Section"
        for element in xml:
            tag = element.tag
            if tag == group_tag:
                self._children.append(SectionGroup(element, self))
            elif tag == section_tag:
                self._children.append(Section(element, self))

    def __getitem__(self, item):
        """ Index the children by position or by name.

        Be aware that names may be duplicated when indexing by string; the first match wins.
        """
        if isinstance(item, int):
            return self._children[item]
        if not isinstance(item, str):
            return None
        return next((child for child in self if child.name == item), None)

    def _search(self, *, use_node_cache=True, reparse=False):
        """ Build (or reuse) this group's tree node and attach every child's node to it. """
        has_subtree = not self._node.is_leaf
        if has_subtree and use_node_cache:
            return self._node

        self._node = self.init_node()
        # Multithreading made almost no difference in efficiency here, so children are handled serially
        for child in self._children:
            child_node = child._search(use_node_cache=use_node_cache, reparse=reparse)
            child_node.parent = self._node

        return self._node
740
+
741
+
742
class Section(HierarchyNode, _CommonMethods):
    """A section: an ordered list of pages."""

    def __init__(self, xml=None, parent_node=None):
        """Store the XML element and parent, then load the pages and build the tree node.

        :param xml: element describing this section, or None for an empty one
        :param parent_node: the object that owns this section
        """
        self.xml = xml
        super().__init__(self)
        self.color = ""
        self.read_only = False
        self.is_currently_viewed = False
        self._children = []
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

        self._node = self.init_node()

    def get_href(self):
        """Return the ``onenote:`` link for this section."""
        return 'onenote:' + self.path

    def init_node(self):
        """Create a fresh, childless tree node labelled with this section's name and link."""
        node = Node(self.name, _category='Section',
                    _html_content=f'<a href="{self.get_href()}"><font color="#b38600">〈{self.name}〉</font></a>')
        return node

    def __iter__(self):
        """Iterate over the pages in stored order."""
        yield from self._children

    def __str__(self):
        """Use the section's name as its string form."""
        return self.name

    def __deserialize_from_xml(self, xml):
        """Read the attributes from ``xml`` and wrap every child element as a Page."""
        HierarchyNode.deserialize_from_xml(self, xml)
        self.color = xml.get("color")
        # ``get`` returns its default instead of raising when the attribute is
        # absent, so the False fallback has to be passed as the default; a
        # try/except around the call never triggers.
        self.read_only = xml.get("readOnly", False)
        self.is_currently_viewed = xml.get("isCurrentlyViewed", False)

        self._children = [Page(node, self) for node in xml]

    def __getitem__(self, item):
        """Look up a page by integer position or by name.

        A string key returns the first page with that name.  Unknown names
        and other key types give None.
        """
        if isinstance(item, int):
            return self._children[item]
        elif isinstance(item, str):
            for nb in self:
                if nb.name == item:
                    return nb
        return None

    def get_page_num(self):
        """Return the number of pages in this section."""
        return len(self._children)

    def _search(self, *, use_node_cache=True, reparse=False):
        """Return this section's search-tree node, building the subtree when needed.

        The existing node is reused when it already has children and
        ``use_node_cache`` is set.  Otherwise every page is processed and
        nested by its ``page_level``: level '1' pages hang under the section,
        level '2' pages under the latest level-1 page, and all other pages
        under the latest level-2 (or level-1) page.
        """
        if not self._node.is_leaf and use_node_cache:
            return self._node
        else:
            self._node = self.init_node()

        # Most recent level-1 / level-2 page nodes; both start at the section itself.
        page_lv1, page_lv2 = self._node, self._node

        for x in self._children:
            cur_page = x._search(use_node_cache=use_node_cache, reparse=reparse)
            if x.page_level == '1':
                cur_page.parent = self._node
                page_lv2 = page_lv1 = cur_page
            elif x.page_level == '2':
                cur_page.parent = page_lv1
                page_lv2 = cur_page
            else:
                cur_page.parent = page_lv2

        return self._node
821
+
822
+
823
class Page(_CommonMethods):
    """A single page, with access to its XML content and parsed node tree."""

    def __init__(self, xml=None, parent_node=None):
        """Store the XML element and parent, then load attributes and build the tree node.

        :param xml: element describing this page, or None for an empty page
        :param parent_node: the object that owns this page
        """
        self.xml = xml
        self.name = ""
        self.id = ""
        self.date_time = ""
        self.last_modified_time = ""
        self.page_level = ""
        self.is_currently_viewed = ""
        self._children = []
        self.parent = parent_node
        if xml is not None:  # explicit None check is required: the element itself can be falsy
            self.__deserialize_from_xml(xml)

        self._node = self.init_node()  # tree node used for full-text search

    def get_href(self, strict=False):
        """Return a link to this page.

        By OneNote's link rules, when several pages in one section share a
        name, a name-based link only reaches the first of them.  Pass
        ``strict=True`` to get an exact link via ``get_hyperlink_to_object``.
        """
        if strict:
            return onenote.get_hyperlink_to_object(self.id)
        else:
            return self.parent.get_href() + f'#{self.name}'

    def init_node(self):
        """Create a fresh, childless tree node for this page."""
        # page_level is '' without XML and None when the attribute is missing;
        # int() would raise on both, so fall back to 0 for "no level".
        level = int(self.page_level) if self.page_level else 0
        node = Node(self.name, _category='Page', _page_id=self.id, _page_level=level)
        return node

    def __iter__(self):
        """Iterate over the page's child objects in stored order."""
        yield from self._children

    def __str__(self):
        """Use the page's name as its string form."""
        return self.name

    # Get / Set Meta

    @property
    def root(self):
        """Follow ``parent`` links upward and return the topmost object."""
        p = self
        while getattr(p, 'parent', False):
            p = p.parent
        return p

    def __deserialize_from_xml(self, xml):
        """Read the page attributes from ``xml`` and wrap its children as Meta objects."""
        self.xml = xml
        self.name = xml.get("name")
        self.id = xml.get("ID")
        self.date_time = xml.get("dateTime")
        self.last_modified_time = xml.get("lastModifiedTime")
        self.page_level = xml.get("pageLevel")
        self.is_currently_viewed = xml.get("isCurrentlyViewed")
        self._children = [Meta(node) for node in xml]

    def get_xml(self, page_info=0):
        """Return the page's XML content, using an on-disk cache.

        The cache file name combines the page id, ``page_info`` and the last
        modified time, so a modified page misses the cache automatically.

        :param page_info: forwarded to ``get_page_content``
        :return: the XML text, or None when it could not be retrieved
        :raises TimeoutError: re-raised with the page location added to the message
        """
        # 1 Read straight from the cache file when one exists
        prefix = f'{self.id}_{page_info}_'
        file = CACHE_DIR / (prefix + self.last_modified_time.replace(':', '') + '.xml')

        if file.is_file():
            return file.read_text()

        # 2 No cache, or it is out of date: fetch the content through the OneNote interface
        try:
            res = super(OneNote, onenote).get_page_content(self.id, page_info)
        except TimeoutError as e:
            # Build the message with formatting rather than ``+`` so that empty
            # or non-string args cannot raise a second error in this handler.
            detail = e.args[0] if e.args else ''
            e.args = (f'{detail}\n\t{self.abspath_name} 页面获取失败,请检查可能包含的office公式并删除。'
                      f'并且看下您的OneNote可能无响应了,请重启OneNote。',)
            raise

        if res is None:
            logging.warning(f'{self.abspath_name} 未成功提取页面内容')
        else:
            # Drop cache files from older timestamps, then store the new one
            for f in CACHE_DIR.glob(f'{prefix}*.xml'):
                f.delete()
            file.write_text(res)

        return res

    def browser_xml(self, page_info=0):
        """Open the page's XML in the browser helper through a temporary .xml file."""
        from pyxllib.prog.specialist import browser
        from pyxllib.file.specialist import XlPath
        xml = self.get_xml(page_info)
        browser(xml, file=XlPath.tempfile('.xml'))

    def parse_xml(self, root=None, *, page_info=0, reparse=False):
        """Parse this page into a tree and return its root Node.

        :param root: node to attach the parsed content to; a new 'root' node when None
        :param page_info: forwarded to ``get_xml``
        :param reparse: by default a cached parse result is reused; set this to
            force a fresh parse

        The hierarchy shows up in several forms:
            structure A: Outline text boxes
            structure B: heading styles h1, h2, h3, ...
            structure C: indentation levels inside the body text
        so the current parent has to be updated while walking the document,
        which is why ``OETag.parse2tree`` returns the parent to use next.
        """

        # 0 Helper

        def _parse_xml(root):
            soup = BeautifulSoup(xml or '', 'xml')
            # self.browser_xml()  # handy for inspecting the raw xml

            style_defs = {}

            # trick: outline_cnt both flags whether several Outlines need an
            # intermediate node and holds the number of the current Outline.
            outline_cnt = 1 if len(soup.find_all('Outline')) > 1 else 0

            cur_node: XlBs4Tag = soup
            parent = root
            while cur_node:
                x = cur_node
                if isinstance(x, bs4.element.Tag):
                    # The numbers in the branch comments give the order in which
                    # the tags are logically met; the branches themselves are
                    # ordered by frequency for speed.
                    if x.name == 'OE':  # 3
                        parent = OETag.parse2tree(x, parent, style_defs)
                        cur_node = cur_node.next_preorder_node(False)
                        continue
                    elif x.name == 'Outline':  # 2
                        # Handle structure A
                        if outline_cnt:
                            if outline_cnt == 1:
                                parent = Node(f'Outline{outline_cnt}', parent)
                            else:
                                pp = XlNode.find_parent(parent, re.compile('^Outline'))
                                parent = Node(f'Outline{outline_cnt}', pp.parent if pp else parent)
                            outline_cnt += 1
                    elif x.name == 'QuickStyleDef':  # 1
                        style_defs[x['index']] = x['name']
                        cur_node = cur_node.next_preorder_node(False)
                        continue

                cur_node = XlBs4Tag.next_preorder_node(cur_node)

            return root

        # 1 Within one run, identical xml parses to an identical tree, so cache by etag
        xml = self.get_xml(page_info=page_info)
        etag = get_etag(xml)

        if root is None:
            root = Node('root')

        if etag in _page_parsed_cache and not reparse:
            root.children = importer.import_(_page_parsed_cache[etag]).children
            return root

        # 2 Otherwise run the normal parse
        root = _parse_xml(root)
        _page_parsed_cache[etag] = exporter.export(root)

        return root

    def get_page_num(self):
        """A page always counts as exactly one page."""
        return 1

    def _search(self, *, use_node_cache=True, reparse=False):
        """Return this page's search-tree node, parsing the page when needed.

        A node that already has children is taken as already parsed and is
        reused when ``use_node_cache`` is set.  Otherwise the node is recreated,
        filled by ``parse_xml`` and recorded in ``_free_page_nodes``.
        """
        if not self._node.is_leaf and use_node_cache:
            return self._node
        else:
            self._node = self.init_node()

        self.parse_xml(self._node, reparse=reparse)
        _free_page_nodes.append(self._node)
        return self._node
998
+
999
+
1000
class Meta:
    """A name/content metadata entry read from a child element of a page."""

    def __init__(self, xml=None):
        """Store the XML element and load ``name`` and ``content`` from it.

        :param xml: element with ``name`` and ``content`` attributes, or None
        """
        self.xml = xml
        self.name = ""
        self.content = ""
        # ``id`` mirrors ``content``: the original code stored the content
        # attribute under ``id`` only, so it is kept for existing callers and
        # initialised here so that it always exists.
        self.id = ""
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __str__(self):
        """Use the entry's name as its string form."""
        return self.name

    def __deserialize_from_xml(self, xml):
        """Read the ``name`` and ``content`` attributes from ``xml``."""
        self.name = xml.get("name")
        self.content = xml.get("content")
        self.id = self.content

    def get_xml(self):
        """Fetch page content for ``self.id`` through the OneNote interface."""
        return super(OneNote, onenote).get_page_content(self.id)
1018
+
1019
+
1020
class PageContent:
    """The content of a page: its titles and outlines plus any attached files."""

    def __init__(self, xml=None):
        """Store the XML element and load attributes and children from it.

        :param xml: page element to read, or None for an empty object
        """
        self.xml = xml
        self.name = ""
        self.id = ""
        self.date_time = ""
        self.last_modified_time = ""
        self.page_level = ""
        self.lang = ""
        self.is_currently_viewed = ""
        self._children = []
        self.files = []
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        """Iterate over the outlines and titles in document order."""
        yield from self._children

    def __str__(self):
        """Use the page's name as its string form."""
        label = self.name
        return label

    def __deserialize_from_xml(self, xml):
        """Read the page attributes, then sort child elements into children and files."""
        attribute_map = (
            ("name", "name"),
            ("id", "ID"),
            ("date_time", "dateTime"),
            ("last_modified_time", "lastModifiedTime"),
            ("page_level", "pageLevel"),
            ("lang", "lang"),
            ("is_currently_viewed", "isCurrentlyViewed"),
        )
        for field, key in attribute_map:
            setattr(self, field, xml.get(key))

        for element in xml:
            tag = element.tag
            if tag == namespace + "Outline":
                self._children.append(Outline(element))
            elif tag == namespace + "Ink":
                self.files.append(Ink(element))
            elif tag == namespace + "Image":
                self.files.append(Image(element))
            elif tag == namespace + "InsertedFile":
                self.files.append(InsertedFile(element))
            elif tag == namespace + "Title":
                self._children.append(Title(element))
1062
+
1063
+
1064
class Title:
    """The title area of a page, holding its ``OE`` elements."""

    def __init__(self, xml=None):
        """Store the XML element and load style, language and OE children from it.

        :param xml: title element to read, or None for an empty object
        """
        self.xml = xml
        self.style = ""
        self.lang = ""
        self._children = []
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __str__(self):
        """Every title is shown with the same fixed label."""
        return "Page Title"

    def __iter__(self):
        """Iterate over the OE children in document order."""
        yield from self._children

    def __deserialize_from_xml(self, xml):
        """Read ``style`` and ``lang``, then wrap each direct ``OE`` child."""
        self.style = xml.get("style")
        self.lang = xml.get("lang")
        for element in xml:
            if element.tag != namespace + "OE":
                continue
            self._children.append(OE(element, self))
1087
+
1088
+
1089
class Outline:
    """An outline (text box) on a page, holding its top-level ``OE`` elements."""

    def __init__(self, xml=None):
        """Store the XML element and load authorship data and OE children from it.

        :param xml: outline element to read, or None for an empty object
        """
        self.xml = xml
        self.author = ""
        self.author_initials = ""
        self.last_modified_by = ""
        self.last_modified_by_initials = ""
        self.last_modified_time = ""
        self.id = ""
        self._children = []
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        """Iterate over the OE children in document order."""
        yield from self._children

    def __str__(self):
        """Every outline is shown with the same fixed label."""
        return "Outline"

    def __deserialize_from_xml(self, xml):
        """Read the outline attributes, then collect each ``OE`` inside ``OEChildren``."""
        attribute_map = (
            ("author", "author"),
            ("author_initials", "authorInitials"),
            ("last_modified_by", "lastModifiedBy"),
            ("last_modified_by_initials", "lastModifiedByInitials"),
            ("last_modified_time", "lastModifiedTime"),
            ("id", "objectID"),
        )
        for field, key in attribute_map:
            setattr(self, field, xml.get(key))

        for container in xml:
            if container.tag != namespace + "OEChildren":
                continue
            for element in container:
                if element.tag == namespace + "OE":
                    self._children.append(OE(element, self))
1123
+
1124
+
1125
class Position:
    """The ``x``/``y``/``z`` attributes of a position element, kept as read."""

    def __init__(self, xml=None, parent_node=None):
        """Store the XML element and parent, then load the coordinates.

        :param xml: position element to read, or None for an empty object
        :param parent_node: the object this position belongs to
        """
        self.xml = xml
        self.x = self.y = self.z = ""
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __deserialize_from_xml(self, xml):
        """Copy the ``x``, ``y`` and ``z`` attributes from ``xml``."""
        for axis in ("x", "y", "z"):
            setattr(self, axis, xml.get(axis))
1140
+
1141
+
1142
class Size:
    """The ``width``/``height`` attributes of a size element, kept as read."""

    def __init__(self, xml=None, parent_node=None):
        """Store the XML element and parent, then load the dimensions.

        :param xml: size element to read, or None for an empty object
        :param parent_node: the object this size belongs to
        """
        self.xml = xml
        self.width = self.height = ""
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __deserialize_from_xml(self, xml):
        """Copy the ``width`` and ``height`` attributes from ``xml``."""
        for dimension in ("width", "height"):
            setattr(self, dimension, xml.get(dimension))
1155
+
1156
+
1157
class OE:
    """An outline element: text, nested outline elements and attached files."""

    def __init__(self, xml=None, parent_node=None):
        """Store the XML element and parent, then load attributes and children.

        :param xml: ``OE`` element to read, or None for an empty object
        :param parent_node: the object this element belongs to
        """
        self.xml = xml
        self.creation_time = ""
        self.last_modified_time = ""
        self.last_modified_by = ""
        self.id = ""
        self.alignment = ""
        self.quick_style_index = ""
        self.style = ""
        self.text = ""
        self._children = []
        self.parent = parent_node
        self.files = []
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        """Iterate over the nested outline elements in document order."""
        yield from self._children

    def __str__(self):
        """Return the element's text, or "Empty OE" when the attribute is missing."""
        return getattr(self, "text", "Empty OE")

    def __deserialize_from_xml(self, xml):
        """Read the element attributes, then dispatch each child element by tag."""
        attribute_map = (
            ("creation_time", "creationTime"),
            ("last_modified_time", "lastModifiedTime"),
            ("last_modified_by", "lastModifiedBy"),
            ("id", "objectID"),
            ("alignment", "alignment"),
            ("quick_style_index", "quickStyleIndex"),
            ("style", "style"),
        )
        for field, key in attribute_map:
            setattr(self, field, xml.get(key))

        for element in xml:
            tag = element.tag
            if tag == namespace + "T":
                # A text element without content is recorded with a placeholder.
                self.text = "NO TEXT" if element.text is None else element.text
            elif tag == namespace + "OEChildren":
                self._children.extend(
                    OE(nested, self) for nested in element if nested.tag == namespace + "OE"
                )
            elif tag == namespace + "Image":
                self.files.append(Image(element, self))
            elif tag == namespace + "InkWord":
                self.files.append(Ink(element, self))
            elif tag == namespace + "InsertedFile":
                self.files.append(InsertedFile(element, self))
1214
+
1215
+
1216
class OETag(bs4.element.Tag):
    """Helpers that turn an ``OE`` bs4 tag into tree nodes.

    Within this file the methods are called through the class with an ordinary
    tag passed as ``self``, e.g. ``OETag.parse2tree(tag, parent, style_defs)``.
    """

    def get_text2(self):
        """Return ``(plain_text, html_text)`` for this tag; an interface meant for bs4 tags.

        A direct ``T`` child gives its text with markup stripped plus the raw
        text.  A direct ``Table`` or ``Image`` child gives the placeholder
        '[Table]' or '[Image]' for both values.  Otherwise both are empty.
        """
        if y := self.find('T', recursive=False):
            t1 = BeautifulSoup(y.text, 'lxml').text
            t2 = y.text
        elif y := self.find('Table', recursive=False):
            # Per the original note: Columns first declares the m columns and
            # their widths, then every row is a Row containing m Cells.
            t1 = '[Table]'
            t2 = t1
        elif y := self.find('Image', recursive=False):
            t1 = '[Image]'
            t2 = t1
        else:
            t1 = ''
            t2 = ''
        return t1, t2

    def parse2tree(self, parent, style_defs):
        """Parse this tag into anytree-style nodes attached below ``parent``.

        :param Node parent: parent node; the content parsed from this tag is
            stored under ``parent.children``
        :param style_defs: style table collected earlier in the document,
            mapping a quickStyleIndex to a style name
        :return: the parent that following siblings should attach to; this is
            the new heading node when this tag is a heading, otherwise the
            (possibly moved up) parent
        """

        # 1 Gather the three main properties
        style_name = style_defs.get(self.get('quickStyleIndex', ''), '')
        pure_text, html_text = OETag.get_text2(self)  # text content
        m = self.find('OEChildren', recursive=False)  # nested text children

        # Skipping empty data (disabled)
        # if not pure_text and not m:
        #     return parent

        # 2 Handle hierarchy B (heading styles)
        if re.match(r'h\d$', style_name):  # heading
            while True:
                parent_style_name = getattr(parent, '_style_name', '')
                if re.match(r'h\d$', parent_style_name) and parent_style_name >= style_name:
                    # The parent is a heading too and its level number is not
                    # smaller than this one's, so the real parent is further up.
                    parent = parent.parent
                else:
                    break
            # A heading resets parent and itself becomes an intermediate node
            cur_node = parent = Node(pure_text, parent, _style_name=style_name, _html_content=html_text)
        else:
            cur_node = Node(pure_text, parent, _html_content=html_text)

        setattr(cur_node, '_category', 'OE')
        setattr(cur_node, '_page_id', XlNode.find_parent(self, 'Page')['ID'])  # noqa find_parent also works on bs4.element.Tag
        setattr(cur_node, '_object_id', self['objectID'])

        # 3 Tables, images and other special structures get an extra level
        if pure_text.startswith('[Table]'):
            for z in self.find_all('T'):
                Node(BeautifulSoup(z.text, 'lxml').text, cur_node, _html_content=z.text)
        elif pure_text.startswith('[Image]'):
            y = self.find('Image', recursive=False)
            for z in y.get('alt', '').splitlines():
                Node(z, cur_node)

        # 4 Handle hierarchy C (indentation levels)
        if m:
            for y in m.find_all('OE', recursive=False):
                OETag.parse2tree(y, cur_node, style_defs)

        return parent
1286
+
1287
+
1288
class InsertedFile:
    """A file inserted into a page, described by its paths, name and modification data."""

    # need to add position data to this class

    def __init__(self, xml=None, parent_node=None):
        """Store the XML element and parent, then load the file attributes.

        :param xml: ``InsertedFile`` element to read, or None for an empty object
        :param parent_node: the object this file belongs to
        """
        self.xml = xml
        self.path_cache = ""
        self.path_source = ""
        self.preferred_name = ""
        self.last_modified_time = ""
        self.last_modified_by = ""
        self.id = ""
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        """Yield a single None; this object has no children of its own."""
        yield None

    def __str__(self):
        """Return the preferred file name, or "Unnamed File" when none is set."""
        # The attribute is ``preferred_name``.  Reading the camelCase spelling
        # always raised AttributeError, so the real name was never returned.
        return self.preferred_name or "Unnamed File"

    def __deserialize_from_xml(self, xml):
        """Copy the file attributes from ``xml``."""
        self.path_cache = xml.get("pathCache")
        self.path_source = xml.get("pathSource")
        self.preferred_name = xml.get("preferredName")
        self.last_modified_time = xml.get("lastModifiedTime")
        self.last_modified_by = xml.get("lastModifiedBy")
        self.id = xml.get("objectID")
1320
+
1321
+
1322
class Ink:
    """An ink object: recognized text, geometry attributes and raw data."""

    # need to add position data to this class

    def __init__(self, xml=None, parent_node=None):
        """Store the XML element and parent, then load the ink attributes.

        :param xml: ink element to read, or None for an empty object
        :param parent_node: the object this ink belongs to
        """
        self.xml = xml
        self.recognized_text = ""
        self.x = ""
        self.y = ""
        self.ink_origin_x = ""
        self.ink_origin_y = ""
        self.width = ""
        self.height = ""
        self.data = ""
        self.callback_id = ""
        self.parent = parent_node

        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        """Yield a single None; this object has no children of its own."""
        yield None

    def __str__(self):
        """Return the recognized text, or "Unrecognized Ink" when there is none."""
        # The attribute is ``recognized_text``.  Reading the camelCase spelling
        # always raised AttributeError, so the real text was never returned.
        return self.recognized_text or "Unrecognized Ink"

    def __deserialize_from_xml(self, xml):
        """Copy the ink attributes, then read ``CallbackID`` and ``Data`` children."""
        self.recognized_text = xml.get("recognizedText")
        self.x = xml.get("x")
        self.y = xml.get("y")
        self.ink_origin_x = xml.get("inkOriginX")
        self.ink_origin_y = xml.get("inkOriginY")
        self.width = xml.get("width")
        self.height = xml.get("height")

        for node in xml:
            if node.tag == namespace + "CallbackID":
                self.callback_id = node.get("callbackID")
            elif node.tag == namespace + "Data":
                self.data = node.text
1365
+
1366
+
1367
class Image:
    """An image object: format, page number, modification time and data."""

    def __init__(self, xml=None, parent_node=None):
        """Store the XML element and parent, then load the image attributes.

        :param xml: ``Image`` element to read, or None for an empty object
        :param parent_node: the object this image belongs to
        """
        self.xml = xml
        self.format = ""
        self.original_page_number = ""
        self.last_modified_time = ""
        self.id = ""
        self.callback_id = None
        self.data = ""
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        """Yield a single None; this object has no children of its own."""
        yield None

    def __str__(self):
        """Return the format followed by " Image", e.g. "png Image"."""
        # ``format`` is None when the element has no such attribute; treat it
        # as empty so str() does not raise TypeError.
        return f"{self.format or ''} Image"

    def __deserialize_from_xml(self, xml):
        """Copy the image attributes, then read ``CallbackID`` and ``Data`` children."""
        self.format = xml.get("format")
        self.original_page_number = xml.get("originalPageNumber")
        self.last_modified_time = xml.get("lastModifiedTime")
        self.id = xml.get("objectID")
        for node in xml:
            if node.tag == namespace + "CallbackID":
                self.callback_id = node.get("callbackID")
            elif node.tag == namespace + "Data":
                # Keep the default "" when the Data element is empty.
                if node.text is not None:
                    self.data = node.text
1398
+
1399
+
1400
# Module-level OneNote instance; the classes above and start_server below use it
# for link generation, page content retrieval and navigation.
onenote = OneNote()
1401
+
1402
+
1403
def start_server(root=None, edits=None, *, port=80, reparse=False):
    """Start a local OneNote search service.

    :param str root: OneNote root location to search; when unset, all OneNote
        notes are initialised.
        Path parts are always separated by a forward slash ``/``.
        Note that this reduces flexibility:
            numbers could otherwise be used as direct indexes, but in this mode
            a number is always taken as a page name;
            and a name may itself contain ``/``.
        Both cases are rare; when a sub-location causes trouble, point at a
        larger one instead, or copy this function and extend it.
    :param str|list[str] edits: locations whose search tree must be refreshed
        before every search
        str: several entries separated by ASCII commas
        list[str]: each entry is a path relative to ``root``
    :param int port: port the service listens on
    :param bool reparse: forwarded to ``get_search_tree`` for the initial build

    >> start_server('核心', ['2022ch4'])
    >> start_server('共享/陈坤泽', ['杂项', 'CF', '吃土乡/大家的幻想乡'])
    """
    from flask import Flask, request

    # 1 Initialise the search data
    parent = onenote(root)
    parent.get_search_tree(print_mode=True, reparse=reparse)

    if isinstance(edits, str):
        # A string probably comes from the command line; convert it to a list
        edits = edits.split(',')  # ASCII commas separate the entries

    # 2 Expose the service endpoints
    app = Flask(__name__)

    @app.route('/search/onenote', methods=['GET'])
    def search_onenote():
        def get_args(key, default=None):
            return request.args.get(key, default)

        pattern = get_args('pattern')
        if pattern:
            # Read the search options from the query string
            # NOTE(review): print_mode arrives as a string when supplied, so
            # any non-empty value (even 'False') is truthy - confirm how
            # ``search`` interprets it.
            res = parent.search(pattern, edits=edits,
                                child_depth=int(get_args('child_depth', 0)),
                                return_mode=get_args('return_mode', 'html'),
                                padding_mode=int(get_args('padding_mode', 0)),
                                print_mode=get_args('print_mode', True),
                                href_mode=int(get_args('href_mode', 2)),  # dynamic links by default
                                dedent=int(get_args('dedent', 1)))
        else:
            # Point the example link at the port actually being served; the
            # default port 80 keeps the short form without an explicit port.
            host = 'localhost' if port == 80 else f'localhost:{port}'
            ref_url = f'http://{host}/search/onenote?pattern=test'
            return f'请输入检索内容,例如 <a href={ref_url}>{ref_url}</a>'
        return res

    @app.route('/search/onenote/linkid', methods=['GET'])
    def linkid():
        """Jump to the target identified by id."""
        page_id = request.args.get('id', None)
        object_id = request.args.get('object_id', '')
        onenote.navigate_to(page_id, object_id)
        # Return a page that closes itself immediately
        return '<script type="text/javascript">window.close();</script>'

    # Per the original note, child threads cannot use the win32com-created
    # onenote resource, so the linkid feature needs everything handled on the
    # single main thread.
    app.run(host='0.0.0.0', port=port, threaded=False)
1464
+
1465
+
1466
if __name__ == '__main__':
    # When run as a script, hand control to python-fire's command-line entry point.
    import fire

    fire.Fire()