pyxllib 0.0.43__py3-none-any.whl → 0.3.197__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +9 -2
- pyxllib/algo/__init__.py +8 -0
- pyxllib/algo/disjoint.py +54 -0
- pyxllib/algo/geo.py +541 -0
- pyxllib/{util/mathlib.py → algo/intervals.py} +172 -36
- pyxllib/algo/matcher.py +389 -0
- pyxllib/algo/newbie.py +166 -0
- pyxllib/algo/pupil.py +629 -0
- pyxllib/algo/shapelylib.py +67 -0
- pyxllib/algo/specialist.py +241 -0
- pyxllib/algo/stat.py +494 -0
- pyxllib/algo/treelib.py +149 -0
- pyxllib/algo/unitlib.py +66 -0
- pyxllib/autogui/__init__.py +5 -0
- pyxllib/autogui/activewin.py +246 -0
- pyxllib/autogui/all.py +9 -0
- pyxllib/autogui/autogui.py +852 -0
- pyxllib/autogui/uiautolib.py +362 -0
- pyxllib/autogui/virtualkey.py +102 -0
- pyxllib/autogui/wechat.py +827 -0
- pyxllib/autogui/wechat_msg.py +421 -0
- pyxllib/autogui/wxautolib.py +84 -0
- pyxllib/cv/__init__.py +1 -11
- pyxllib/cv/expert.py +267 -0
- pyxllib/cv/{imlib.py → imfile.py} +18 -83
- pyxllib/cv/imhash.py +39 -0
- pyxllib/cv/pupil.py +9 -0
- pyxllib/cv/rgbfmt.py +1525 -0
- pyxllib/cv/slidercaptcha.py +137 -0
- pyxllib/cv/trackbartools.py +163 -49
- pyxllib/cv/xlcvlib.py +1040 -0
- pyxllib/cv/xlpillib.py +423 -0
- pyxllib/data/__init__.py +0 -0
- pyxllib/data/echarts.py +240 -0
- pyxllib/data/jsonlib.py +89 -0
- pyxllib/{util/oss2_.py → data/oss.py} +11 -9
- pyxllib/data/pglib.py +1127 -0
- pyxllib/data/sqlite.py +568 -0
- pyxllib/{util → data}/sqllib.py +13 -31
- pyxllib/ext/JLineViewer.py +505 -0
- pyxllib/ext/__init__.py +6 -0
- pyxllib/{util → ext}/demolib.py +119 -35
- pyxllib/ext/drissionlib.py +277 -0
- pyxllib/ext/kq5034lib.py +12 -0
- pyxllib/{util/main.py → ext/old.py} +122 -284
- pyxllib/ext/qt.py +449 -0
- pyxllib/ext/robustprocfile.py +497 -0
- pyxllib/ext/seleniumlib.py +76 -0
- pyxllib/{util/tklib.py → ext/tk.py} +10 -11
- pyxllib/ext/unixlib.py +827 -0
- pyxllib/ext/utools.py +351 -0
- pyxllib/{util/webhooklib.py → ext/webhook.py} +45 -17
- pyxllib/ext/win32lib.py +40 -0
- pyxllib/ext/wjxlib.py +88 -0
- pyxllib/ext/wpsapi.py +124 -0
- pyxllib/ext/xlwork.py +9 -0
- pyxllib/ext/yuquelib.py +1105 -0
- pyxllib/file/__init__.py +17 -0
- pyxllib/file/docxlib.py +761 -0
- pyxllib/{util → file}/gitlib.py +40 -27
- pyxllib/file/libreoffice.py +165 -0
- pyxllib/file/movielib.py +148 -0
- pyxllib/file/newbie.py +10 -0
- pyxllib/file/onenotelib.py +1469 -0
- pyxllib/file/packlib/__init__.py +330 -0
- pyxllib/{util → file/packlib}/zipfile.py +598 -195
- pyxllib/file/pdflib.py +426 -0
- pyxllib/file/pupil.py +185 -0
- pyxllib/file/specialist/__init__.py +685 -0
- pyxllib/{basic/_5_dirlib.py → file/specialist/dirlib.py} +364 -93
- pyxllib/file/specialist/download.py +193 -0
- pyxllib/file/specialist/filelib.py +2829 -0
- pyxllib/file/xlsxlib.py +3131 -0
- pyxllib/file/xlsyncfile.py +341 -0
- pyxllib/prog/__init__.py +5 -0
- pyxllib/prog/cachetools.py +64 -0
- pyxllib/prog/deprecatedlib.py +233 -0
- pyxllib/prog/filelock.py +42 -0
- pyxllib/prog/ipyexec.py +253 -0
- pyxllib/prog/multiprogs.py +940 -0
- pyxllib/prog/newbie.py +451 -0
- pyxllib/prog/pupil.py +1197 -0
- pyxllib/{sitepackages.py → prog/sitepackages.py} +5 -3
- pyxllib/prog/specialist/__init__.py +391 -0
- pyxllib/prog/specialist/bc.py +203 -0
- pyxllib/prog/specialist/browser.py +497 -0
- pyxllib/prog/specialist/common.py +347 -0
- pyxllib/prog/specialist/datetime.py +199 -0
- pyxllib/prog/specialist/tictoc.py +240 -0
- pyxllib/prog/specialist/xllog.py +180 -0
- pyxllib/prog/xlosenv.py +108 -0
- pyxllib/stdlib/__init__.py +17 -0
- pyxllib/{util → stdlib}/tablepyxl/__init__.py +1 -3
- pyxllib/{util → stdlib}/tablepyxl/style.py +1 -1
- pyxllib/{util → stdlib}/tablepyxl/tablepyxl.py +2 -4
- pyxllib/text/__init__.py +8 -0
- pyxllib/text/ahocorasick.py +39 -0
- pyxllib/text/airscript.js +744 -0
- pyxllib/text/charclasslib.py +121 -0
- pyxllib/text/jiebalib.py +267 -0
- pyxllib/text/jinjalib.py +32 -0
- pyxllib/text/jsa_ai_prompt.md +271 -0
- pyxllib/text/jscode.py +922 -0
- pyxllib/text/latex/__init__.py +158 -0
- pyxllib/text/levenshtein.py +303 -0
- pyxllib/text/nestenv.py +1215 -0
- pyxllib/text/newbie.py +300 -0
- pyxllib/text/pupil/__init__.py +8 -0
- pyxllib/text/pupil/common.py +1121 -0
- pyxllib/text/pupil/xlalign.py +326 -0
- pyxllib/text/pycode.py +47 -0
- pyxllib/text/specialist/__init__.py +8 -0
- pyxllib/text/specialist/common.py +112 -0
- pyxllib/text/specialist/ptag.py +186 -0
- pyxllib/text/spellchecker.py +172 -0
- pyxllib/text/templates/echart_base.html +11 -0
- pyxllib/text/templates/highlight_code.html +17 -0
- pyxllib/text/templates/latex_editor.html +103 -0
- pyxllib/text/vbacode.py +17 -0
- pyxllib/text/xmllib.py +747 -0
- pyxllib/xl.py +39 -0
- pyxllib/xlcv.py +17 -0
- pyxllib-0.3.197.dist-info/METADATA +48 -0
- pyxllib-0.3.197.dist-info/RECORD +126 -0
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info}/WHEEL +4 -5
- pyxllib/basic/_1_strlib.py +0 -945
- pyxllib/basic/_2_timelib.py +0 -488
- pyxllib/basic/_3_pathlib.py +0 -916
- pyxllib/basic/_4_loglib.py +0 -419
- pyxllib/basic/__init__.py +0 -54
- pyxllib/basic/arrow_.py +0 -250
- pyxllib/basic/chardet_.py +0 -66
- pyxllib/basic/dirlib.py +0 -529
- pyxllib/basic/dprint.py +0 -202
- pyxllib/basic/extension.py +0 -12
- pyxllib/basic/judge.py +0 -31
- pyxllib/basic/log.py +0 -204
- pyxllib/basic/pathlib_.py +0 -705
- pyxllib/basic/pytictoc.py +0 -102
- pyxllib/basic/qiniu_.py +0 -61
- pyxllib/basic/strlib.py +0 -761
- pyxllib/basic/timer.py +0 -132
- pyxllib/cv/cv.py +0 -834
- pyxllib/cv/cvlib/_1_geo.py +0 -543
- pyxllib/cv/cvlib/_2_cvprcs.py +0 -309
- pyxllib/cv/cvlib/_2_imgproc.py +0 -594
- pyxllib/cv/cvlib/_3_pilprcs.py +0 -80
- pyxllib/cv/cvlib/_4_cvimg.py +0 -211
- pyxllib/cv/cvlib/__init__.py +0 -10
- pyxllib/cv/debugtools.py +0 -82
- pyxllib/cv/fitz_.py +0 -300
- pyxllib/cv/installer.py +0 -42
- pyxllib/debug/_0_installer.py +0 -38
- pyxllib/debug/_1_typelib.py +0 -277
- pyxllib/debug/_2_chrome.py +0 -198
- pyxllib/debug/_3_showdir.py +0 -161
- pyxllib/debug/_4_bcompare.py +0 -140
- pyxllib/debug/__init__.py +0 -49
- pyxllib/debug/bcompare.py +0 -132
- pyxllib/debug/chrome.py +0 -198
- pyxllib/debug/installer.py +0 -38
- pyxllib/debug/showdir.py +0 -158
- pyxllib/debug/typelib.py +0 -278
- pyxllib/image/__init__.py +0 -12
- pyxllib/torch/__init__.py +0 -20
- pyxllib/torch/modellib.py +0 -37
- pyxllib/torch/trainlib.py +0 -344
- pyxllib/util/__init__.py +0 -20
- pyxllib/util/aip_.py +0 -141
- pyxllib/util/casiadb.py +0 -59
- pyxllib/util/excellib.py +0 -495
- pyxllib/util/filelib.py +0 -612
- pyxllib/util/jsondata.py +0 -27
- pyxllib/util/jsondata2.py +0 -92
- pyxllib/util/labelmelib.py +0 -139
- pyxllib/util/onepy/__init__.py +0 -29
- pyxllib/util/onepy/onepy.py +0 -574
- pyxllib/util/onepy/onmanager.py +0 -170
- pyxllib/util/pyautogui_.py +0 -219
- pyxllib/util/textlib.py +0 -1305
- pyxllib/util/unorder.py +0 -22
- pyxllib/util/xmllib.py +0 -639
- pyxllib-0.0.43.dist-info/METADATA +0 -39
- pyxllib-0.0.43.dist-info/RECORD +0 -80
- pyxllib-0.0.43.dist-info/top_level.txt +0 -1
- {pyxllib-0.0.43.dist-info → pyxllib-0.3.197.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,1469 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : 陈坤泽
|
4
|
+
# @Email : 877362867@qq.com
|
5
|
+
# @Date : 2022/06/28 21:40
|
6
|
+
|
7
|
+
import datetime
|
8
|
+
import logging
|
9
|
+
import re
|
10
|
+
import time
|
11
|
+
import warnings
|
12
|
+
import os
|
13
|
+
from threading import Thread
|
14
|
+
from functools import reduce
|
15
|
+
|
16
|
+
import bs4
|
17
|
+
import pytz
|
18
|
+
from xml.etree import ElementTree
|
19
|
+
from humanfriendly import format_size
|
20
|
+
from anytree.importer import DictImporter
|
21
|
+
from anytree.exporter import DictExporter
|
22
|
+
|
23
|
+
# 过滤这类警告
|
24
|
+
warnings.filterwarnings("ignore", category=bs4.MarkupResemblesLocatorWarning, module='bs4')
|
25
|
+
|
26
|
+
import win32com.client
|
27
|
+
|
28
|
+
if win32com.client.gencache.is_readonly:
|
29
|
+
win32com.client.gencache.is_readonly = False
|
30
|
+
win32com.client.gencache.Rebuild()
|
31
|
+
|
32
|
+
from pyxllib.prog.newbie import SingletonForEveryClass
|
33
|
+
from pyxllib.prog.pupil import Timeout
|
34
|
+
from pyxllib.prog.specialist import tqdm
|
35
|
+
from pyxllib.algo.treelib import Node, XlNode
|
36
|
+
from pyxllib.text.xmllib import BeautifulSoup, XlBs4Tag
|
37
|
+
from pyxllib.file.specialist import XlPath, get_etag
|
38
|
+
|
39
|
+
"""
|
40
|
+
参考了onepy的实现,做了重构。OnePy:Provides pythonic wrappers around OneNote COM interfaces
|
41
|
+
"""
|
42
|
+
|
43
|
+
# XML namespace prefix (ElementTree "{uri}" form) used when comparing element tags.
# ONProcess.__init__ overwrites this global with the namespace of the running process.
namespace = "{http://schemas.microsoft.com/office/onenote/2013/onenote}"

# Page nodes that are not yet attached to a parent node; read by the
# progress-bar thread so that it can count them as already parsed.
_free_page_nodes = []

# Directory holding the on-disk search caches.
CACHE_DIR = XlPath.tempdir() / 'OneNote/SearchCache'
os.makedirs(CACHE_DIR, exist_ok=True)

# Cache of parsed page results, used to speed up parsing.
_page_parsed_cache = {}
_page_parsed_cache_file = CACHE_DIR / 'page_parsed_cache_file.pkl'
if _page_parsed_cache_file.is_file():
    try:
        _page_parsed_cache = _page_parsed_cache_file.read_pkl()
    except Exception as e:  # unpickling can fail in many ways (truncated file, renamed classes, ...)
        # The cache is only an optimisation: an unreadable file must not make
        # the whole module unimportable, so start over with an empty cache.
        logging.warning('Ignoring unreadable OneNote page cache %s: %s', _page_parsed_cache_file, e)
        _page_parsed_cache = {}

# Used to load and save serialized node data.
importer = DictImporter()
exporter = DictExporter()
|
61
|
+
|
62
|
+
|
63
|
+
class ONProcess(metaclass=SingletonForEveryClass):
    """Snake_case wrapper around the OneNote COM ``Application`` interface.

    The COM methods are CamelCase; each method here forwards to the COM method
    of the corresponding name on ``self.process``.

    Official reference:
        Application interface (OneNote) | Microsoft Docs:
        https://docs.microsoft.com/en-us/office/client-developer/onenote/application-interface-onenote
    """

    def __init__(self, timeout=30):
        """Start (or attach to) OneNote and remember the per-page timeout.

        :param timeout: time limit, in seconds, for reading a single page in
            :meth:`get_page_content`. It was originally meant to be 5 seconds,
            but some very long pages need more, so the default is 30.
        """
        # TODO: detect the installed OneNote version automatically instead of
        #   asking the user; a user-supplied version would lead to several
        #   instances and problems when fetching xml.
        #   OneNote 2016 is supported; other versions are untested. The earlier
        #   onepy implementation used the ".../2010/onenote" namespace for
        #   version 14 and 'OneNote.Application.<version>' prog ids.
        self.process = win32com.client.DispatchEx('OneNote.Application')
        self.namespace = "{http://schemas.microsoft.com/office/onenote/2013/onenote}"

        # Publish the namespace module-wide; the hierarchy classes compare tags against it.
        global namespace
        namespace = self.namespace

        self.timeout_seconds = timeout

    def _best_effort(self, failure_message, com_method_name, *args):
        """Call ``self.process.<com_method_name>(*args)`` without raising.

        On any exception ``failure_message`` is printed (as before) and the
        traceback is additionally logged at DEBUG level so that failures can be
        diagnosed; ``None`` is returned in that case.
        """
        try:
            return getattr(self.process, com_method_name)(*args)
        except Exception:
            print(failure_message)
            logging.getLogger(__name__).debug(failure_message, exc_info=True)
            return None

    def get_hierarchy(self, start_node_id="", hierarchy_scope=4):
        """Return the result of COM ``GetHierarchy`` for the given start node.

        HierarchyScope
          0 - Gets just the start node specified and no descendants.
          1 - Gets the immediate child nodes of the start node, and no descendants in higher or lower subsection groups.
          2 - Gets all notebooks below the start node, or root.
          3 - Gets all sections below the start node, including sections in section groups and subsection groups.
          4 - Gets all pages below the start node, including all pages in section groups and subsection groups.
        """
        return self.process.GetHierarchy(start_node_id, hierarchy_scope)

    def update_hierarchy(self, changes_xml_in):
        """Best-effort COM ``UpdateHierarchy``; prints a message on failure."""
        self._best_effort("Could not Update Hierarchy", "UpdateHierarchy", changes_xml_in)

    def open_hierarchy(self, path, relative_to_object_id, object_id, create_file_type=0):
        """Best-effort COM ``OpenHierarchy``; returns its result or ``None`` on failure.

        ``object_id`` is accepted for interface compatibility but is not
        forwarded: an empty string is passed in that position.

        CreateFileType
          0 - Creates no new object.
          1 - Creates a notebook with the specified name at the specified location.
          2 - Creates a section group with the specified name at the specified location.
          3 - Creates a section with the specified name at the specified location.
        """
        return self._best_effort("Could not Open Hierarchy", "OpenHierarchy",
                                 path, relative_to_object_id, "", create_file_type)

    def delete_hierarchy(self, object_id, excpect_last_modified=""):
        """Best-effort COM ``DeleteHierarchy``; prints a message on failure."""
        self._best_effort("Could not Delete Hierarchy", "DeleteHierarchy",
                          object_id, excpect_last_modified)

    def create_new_page(self, section_id, new_page_style=0):
        """Best-effort COM ``CreateNewPage``.

        Returns whatever the COM call returns (``None`` on failure). Previously
        the result was discarded.

        NewPageStyle
          0 - Create a page that has the default page style
          1 - Create a blank page that has a title
          2 - Create a blank page that has no title
        """
        return self._best_effort("Unable to create the page", "CreateNewPage",
                                 section_id, "", new_page_style)

    def close_notebook(self, notebook_id):
        """Best-effort COM ``CloseNotebook``; prints a message on failure."""
        self._best_effort("Could not Close Notebook", "CloseNotebook", notebook_id)

    def get_page_content(self, page_id, page_info=0):
        """Return the result of COM ``GetPageContent``, limited by ``self.timeout_seconds``.

        Unlike most other wrappers, errors are not swallowed here.

        PageInfo
          0 - Returns only file page content, without selection markup and binary data objects. This is the standard value to pass.
          1 - Returns page content with no selection markup, but with all binary data.
          2 - Returns page content with selection markup, but no binary data.
          3 - Returns page content with selection markup and all binary data.
        """
        with Timeout(self.timeout_seconds):
            return self.process.GetPageContent(page_id, "", page_info)

    def update_page_content(self, page_changes_xml_in, excpect_last_modified=0):
        """Best-effort COM ``UpdatePageContent``; prints a message on failure."""
        self._best_effort("Could not Update Page Content", "UpdatePageContent",
                          page_changes_xml_in, excpect_last_modified)

    def get_binary_page_content(self, page_id, callback_id):
        """Best-effort COM ``GetBinaryPageContent``; returns its result or ``None``."""
        return self._best_effort("Could not Get Binary Page Content", "GetBinaryPageContent",
                                 page_id, callback_id)

    def delete_page_content(self, page_id, object_id, excpect_last_modified=0):
        """Best-effort COM ``DeletePageContent``; prints a message on failure."""
        self._best_effort("Could not Delete Page Content", "DeletePageContent",
                          page_id, object_id, excpect_last_modified)

    # Actions

    def navigate_to(self, page_id, object_id='', new_window=False):
        """Best-effort COM ``NavigateTo``; prints a message on failure."""
        self._best_effort("Could not Navigate To", "NavigateTo",
                          page_id, object_id, new_window)

    def publish(self, hierarchy_id, target_file_path, publish_format, clsid_of_exporter=""):
        """Best-effort COM ``Publish``; prints a message on failure.

        PublishFormat
          0 - Published page is in .one format.
          1 - Published page is in .onea format.
          2 - Published page is in .mht format.
          3 - Published page is in .pdf format.
          4 - Published page is in .xps format.
          5 - Published page is in .doc or .docx format.
          6 - Published page is in enhanced metafile (.emf) format.
        """
        self._best_effort("Could not Publish", "Publish",
                          hierarchy_id, target_file_path, publish_format, clsid_of_exporter)

    def open_package(self, path_package, path_dest):
        """Best-effort COM ``OpenPackage``; returns its result or ``None``."""
        return self._best_effort("Could not Open Package", "OpenPackage",
                                 path_package, path_dest)

    def get_hyperlink_to_object(self, page_id, object_id=""):
        """Best-effort COM ``GetHyperlinkToObject``; returns its result or ``None``.

        :param str page_id:
            The OneNote ID for the notebook, section group, section, or page for which you want a hyperlink.
        :param str object_id: The OneNote ID for the object within the page for which you want a hyperlink.
        """
        return self._best_effort("Could not Get Hyperlink", "GetHyperlinkToObject",
                                 page_id, object_id)

    def find_pages(self, start_node_id, search_string, display):
        """Best-effort COM ``FindPages``; returns its result or ``None``.

        An empty string and ``False`` are passed for the third and fourth COM arguments.
        """
        return self._best_effort("Could not Find Pages", "FindPages",
                                 start_node_id, search_string, "", False, display)

    def get_special_location(self, special_location=0):
        """Best-effort COM ``GetSpecialLocation``; returns its result or ``None``.

        SpecialLocation
          0 - Gets the path to the Backup Folders folder location.
          1 - Gets the path to the Unfiled Notes folder location.
          2 - Gets the path to the Default Notebook folder location.
        """
        return self._best_effort("Could not retreive special location", "GetSpecialLocation",
                                 special_location)
|
250
|
+
|
251
|
+
|
252
|
+
class _CommonMethods:
|
253
|
+
""" 笔记本、分区组、分区、页面 共有的一些成员方法 """
|
254
|
+
|
255
|
+
def init_node(self):
|
256
|
+
node = Node(self.name, _category=type(self).__name__)
|
257
|
+
# if type(self).__name__ != 'Page':
|
258
|
+
node._html_content = f'<a href="onenote/linkid?id={self.id}" target="_blank">{self.name}</a>'
|
259
|
+
return node
|
260
|
+
|
261
|
+
@property
|
262
|
+
def ancestors(self):
|
263
|
+
""" 获得所有父结点 """
|
264
|
+
parents = []
|
265
|
+
p = self
|
266
|
+
while getattr(p, 'parent', False):
|
267
|
+
parents.append(p.parent)
|
268
|
+
p = p.parent
|
269
|
+
return reversed(parents)
|
270
|
+
|
271
|
+
@property
|
272
|
+
def abspath_name(self):
|
273
|
+
names = [x.name for x in self.ancestors]
|
274
|
+
names.append(self.name)
|
275
|
+
return '/'.join(names)
|
276
|
+
|
277
|
+
def get_page_num(self):
|
278
|
+
return sum([x.get_page_num() for x in self._children])
|
279
|
+
|
280
|
+
def get_search_tree(self, *, print_mode=True, use_node_cache=True, reparse=False):
|
281
|
+
""" 获得检索树的根节点
|
282
|
+
|
283
|
+
:param print_mode: 输出所包含页面解析进度
|
284
|
+
:param use_node_cache: 检查xml是否有更新
|
285
|
+
默认可以不检查,如果一个widget有_node结点可以直接使用
|
286
|
+
:param reparse: 使用旧的解析过xml的持久化文件数据
|
287
|
+
这个好处是速度快,坏处是如果解析代码功能有更新,这样提取到的是错误的旧的解析数据
|
288
|
+
这个缓存文件的处理规则,为了一些细节优化,也稍复杂~~
|
289
|
+
"""
|
290
|
+
global _page_parsed_cache
|
291
|
+
|
292
|
+
# 1 进度条工具,开一个子线程,每秒监控页面解析进度
|
293
|
+
def timer_progress():
|
294
|
+
""" 为了实现这个进度条,还有点技术含量呢,用了子线程和一些trick """
|
295
|
+
global _free_page_nodes
|
296
|
+
|
297
|
+
def dfs(x):
|
298
|
+
cnt = 0
|
299
|
+
|
300
|
+
_category = getattr(x, '_category', '')
|
301
|
+
if _category == 'Page':
|
302
|
+
cnt += 1
|
303
|
+
elif _category == '':
|
304
|
+
return cnt
|
305
|
+
|
306
|
+
for y in x.children:
|
307
|
+
cnt += dfs(y)
|
308
|
+
return cnt
|
309
|
+
|
310
|
+
total = self.get_page_num()
|
311
|
+
_tqdm = tqdm(desc='OneNote解析页面', disable=False, total=total)
|
312
|
+
while not stop_flag:
|
313
|
+
num = dfs(self._node) # 只读取,不修改self._node,而且进度条稍微有些错没关系,所以不加锁
|
314
|
+
_free_page_nodes = list(filter(lambda x: x.root != self._node, _free_page_nodes))
|
315
|
+
num += len(_free_page_nodes)
|
316
|
+
_tqdm.n = num
|
317
|
+
_tqdm.refresh()
|
318
|
+
|
319
|
+
time.sleep(1)
|
320
|
+
|
321
|
+
# 最后统计一轮
|
322
|
+
num = dfs(self._node)
|
323
|
+
_free_page_nodes = []
|
324
|
+
_tqdm.total = _tqdm.n = num
|
325
|
+
_tqdm.refresh()
|
326
|
+
|
327
|
+
# print(f'一共{total}个页面,实际解析出{num}个页面')
|
328
|
+
# 实际解析成功的页面数
|
329
|
+
return num
|
330
|
+
|
331
|
+
# 2 主线程解析页面的过程,子线程每秒钟展示一次进度情况
|
332
|
+
if reparse and type(self).__name__ == 'OneNote': # 如果是OneNote层面reparse,直接重置整个缓存文件
|
333
|
+
_page_parsed_cache = {}
|
334
|
+
cache_num = len(_page_parsed_cache)
|
335
|
+
|
336
|
+
if print_mode:
|
337
|
+
stop_flag = False
|
338
|
+
timer_thread = Thread(target=timer_progress)
|
339
|
+
timer_thread.start()
|
340
|
+
root = self._search(use_node_cache=use_node_cache, reparse=reparse)
|
341
|
+
stop_flag = True # 使用进程里的共享变量,进行主线程和子线程之间的通信
|
342
|
+
timer_thread.join()
|
343
|
+
else:
|
344
|
+
root = self._search(use_node_cache=use_node_cache, reparse=reparse)
|
345
|
+
|
346
|
+
# 保存缓存文件
|
347
|
+
if cache_num != len(_page_parsed_cache) or reparse:
|
348
|
+
# 如果前后数量不一致,表示有更新内容,重新写入一份缓存文件。或者明确使用了reparse了也要保存。
|
349
|
+
_page_parsed_cache_file.write_pkl(_page_parsed_cache)
|
350
|
+
|
351
|
+
root.parent = None
|
352
|
+
return root
|
353
|
+
|
354
|
+
def search(self, pattern, child_depth=0, *,
|
355
|
+
edits=None, reparse=False,
|
356
|
+
print_mode=False, return_mode='text',
|
357
|
+
padding_mode=0, dedent=1, href_mode=1):
|
358
|
+
""" 查找内容
|
359
|
+
|
360
|
+
Page、Section、SectionGroup等实际所用的search方法
|
361
|
+
|
362
|
+
:param pattern:
|
363
|
+
text, 检索出现该关键词的node,并展示其相关的上下文内容
|
364
|
+
func, 可以输入自定义函数 check_node(node)->bool,True表示符合检索条件的node
|
365
|
+
re.compile,可以输入编译的正则模式,会使用re.search进行匹配
|
366
|
+
:param int child_depth: 对于检索到的node,向下展开几层子结点
|
367
|
+
-1,表示全部展开
|
368
|
+
0,表示不展开子结点
|
369
|
+
1,只展开直接子结点
|
370
|
+
...
|
371
|
+
:param return_mode:
|
372
|
+
text,文本展示
|
373
|
+
html,网页富文本
|
374
|
+
:param href_mode: 超链接的模式
|
375
|
+
0,不设超链接。text模式下强制设为0。
|
376
|
+
1,静态链接(需要调用onenote生成,要多花一点点时间)
|
377
|
+
2,动态链接,在开启server的时候使用才有意义
|
378
|
+
"""
|
379
|
+
# 1 按照规则检索内容
|
380
|
+
if isinstance(pattern, str): # 文本关键词检索
|
381
|
+
def check_node(node):
|
382
|
+
# 纯文本部分
|
383
|
+
if pattern in node.name:
|
384
|
+
return True
|
385
|
+
# 如果纯文本找不到,也会在富文本格式里尝试匹配
|
386
|
+
html_text = getattr(node, '_html_content', '')
|
387
|
+
if html_text and pattern in html_text:
|
388
|
+
return True
|
389
|
+
return False
|
390
|
+
|
391
|
+
elif isinstance(pattern, re.Pattern): # 正则检索
|
392
|
+
def check_node(node):
|
393
|
+
if pattern.search(node.name):
|
394
|
+
return True
|
395
|
+
html_text = getattr(node, '_html_content', '')
|
396
|
+
if html_text and pattern.search(html_text):
|
397
|
+
return True
|
398
|
+
return False
|
399
|
+
else: # 自定义检索
|
400
|
+
check_node = pattern
|
401
|
+
|
402
|
+
# 2 更新索引并获得解析树
|
403
|
+
start_time = time.time()
|
404
|
+
edits = edits or []
|
405
|
+
edits = [list(name.split('/')) for name in edits]
|
406
|
+
# 更新易变数据的检索树
|
407
|
+
for path in edits:
|
408
|
+
node = self(path)
|
409
|
+
print(node.abspath_name)
|
410
|
+
node.get_search_tree(print_mode=False, use_node_cache=False, reparse=reparse)
|
411
|
+
root = self.get_search_tree(print_mode=print_mode, use_node_cache=True, reparse=reparse)
|
412
|
+
elapsed1 = time.time() - start_time
|
413
|
+
|
414
|
+
# 3 检索内容
|
415
|
+
start_time = time.time()
|
416
|
+
n = XlNode.sign_node(root, check_node, flag_name='_flag', child_depth=child_depth, reset_flag=True)
|
417
|
+
elapsed2 = time.time() - start_time
|
418
|
+
|
419
|
+
# 4 html情况下的渲染算法
|
420
|
+
def node_to_html(x, depth):
|
421
|
+
import html
|
422
|
+
|
423
|
+
if depth < 0:
|
424
|
+
return '<br/>'
|
425
|
+
|
426
|
+
content = f'{getattr(x, "_html_content", html.escape(x.name))}'
|
427
|
+
if not hasattr(x, '_category'):
|
428
|
+
pass
|
429
|
+
elif x._category == 'OE':
|
430
|
+
if href_mode == 1:
|
431
|
+
url = onenote.get_hyperlink_to_object(x._page_id, x._object_id)
|
432
|
+
content += f' <a href="{url}">go</a>'
|
433
|
+
elif href_mode == 2:
|
434
|
+
url = f"onenote/linkid?id={x._page_id}&object_id={x._object_id}"
|
435
|
+
content += f' <a href="{url}" target="_blank">go</a>'
|
436
|
+
elif x._category == 'Page':
|
437
|
+
color = ['#009900', '#00b300', '#00cc00'][x._page_level - 1]
|
438
|
+
content = f'<font color="{color}">{x.name}</font>'
|
439
|
+
if href_mode == 1:
|
440
|
+
url = onenote.get_hyperlink_to_object(x._page_id)
|
441
|
+
content = f'<a href="{url}">{content}</a>'
|
442
|
+
elif href_mode == 2:
|
443
|
+
url = f"onenote/linkid?id={x._page_id}"
|
444
|
+
content = f'<a href="{url}" target="_blank">{content}</a>'
|
445
|
+
|
446
|
+
content = content.replace('\n', ' ')
|
447
|
+
|
448
|
+
if padding_mode == 1:
|
449
|
+
div = f'<div>{" " * depth * 4}{content}</div>'
|
450
|
+
else:
|
451
|
+
div = f'<div style="padding-left:{depth * 2 + 1}em;text-indent:-1em">{content}</div>'
|
452
|
+
|
453
|
+
return div
|
454
|
+
|
455
|
+
# 5 展示内容
|
456
|
+
texts = [f'更新数据:{elapsed1:.2f}秒,内容检索:{elapsed2:.2f}秒,匹配条目数:{n}']
|
457
|
+
if return_mode == 'text':
|
458
|
+
body = XlNode.render(root, filter_=lambda x: getattr(x, '_flag', 0), dedent=dedent)
|
459
|
+
texts[0] += f',内容大小:{format_size(len(body.encode()), binary=True)}\n'
|
460
|
+
texts.append(body)
|
461
|
+
return '\n'.join(texts)
|
462
|
+
elif return_mode == 'html':
|
463
|
+
body = XlNode.render_html(root, node_to_html, filter_=lambda x: getattr(x, '_flag', 0), dedent=dedent)
|
464
|
+
texts[0] += f',内容大小:{format_size(len(body.encode()), binary=True)}<br/>'
|
465
|
+
texts.append(body)
|
466
|
+
return '<br/>'.join(texts)
|
467
|
+
else:
|
468
|
+
raise ValueError
|
469
|
+
|
470
|
+
def __call__(self, item=None):
|
471
|
+
""" 通过路径形式定位 """
|
472
|
+
if isinstance(item, str):
|
473
|
+
if '/' in item:
|
474
|
+
return reduce(lambda x, name: x[name], [self] + list(item.split('/')))
|
475
|
+
else:
|
476
|
+
return self[item]
|
477
|
+
else:
|
478
|
+
return self
|
479
|
+
|
480
|
+
|
481
|
+
class OneNote(ONProcess, _CommonMethods):
    """The OneNote application as the root of the notebook hierarchy.

    This is a singleton class: it inherits the singleton metaclass from
    ONProcess. Objects created from ONProcess and from OneNote are two
    different objects, though.
    """

    def __init__(self, timeout=30):
        r"""Connect to OneNote and load the full hierarchy (down to pages).

        If this error shows up:
            This COM object can not automate the makepy process - please run makepy manually for this object
        follow the instructions at the end of https://github.com/varunsrin/one-py :
        delete the 1.0 entry under
            HKEY_CLASSES_ROOT\TypeLib\{0EA692EE-BB50-4E3C-AEF0-356D91732725}
        (this key id is the same on every machine).

        :param timeout: forwarded to ONProcess (per-page read limit in seconds)
        """
        # trick: because of the singleton handling, ONProcess has to be initialised first.
        super().__init__(timeout)

        self.xml = self.get_hierarchy("", 4)
        self.object_tree = ElementTree.fromstring(self.xml)
        self.hierarchy = Hierarchy(self.object_tree)
        self._children = list(self.hierarchy)
        self.name = 'onenote'
        # The id of the first notebook doubles as the id of OneNote itself so
        # that id-based features work uniformly; empty when no notebook is open.
        self.id = self._children[0].id if self._children else ''
        self._node = self.init_node()

    def get_href(self):
        """Link target for OneNote itself: the href of the first notebook.

        The application has no link of its own. Returns '' when no notebook is open.
        """
        if not self._children:
            return ''
        return self._children[0].get_href()

    def init_node(self):
        """Create the root search-tree node."""
        node = Node(self.name, _category='OneNote', _html_content=f'<font color="purple">OneNote</font>')
        return node

    def get_page_content(self, page_id, page_info=0):
        """Fetch a page and wrap its parsed xml in a ``PageContent``.

        :param page_id: id of the page
        :param page_info: forwarded to :meth:`ONProcess.get_page_content`
        """
        raw_xml = super(OneNote, self).get_page_content(page_id, page_info)
        page_content_xml = ElementTree.fromstring(raw_xml)
        return PageContent(page_content_xml)

    def update_page_content(self, page_changes_xml_in):
        """Write changed page xml back to OneNote.

        :param page_changes_xml_in: the page xml, e.g.
            onenote.update_page_content(page.get_xml().replace('曹一众', '曹二众'))
            NOTE(review): the original documentation says a bs4 soup object may
            be passed as well; the value is forwarded unchanged, so whether
            that works depends on the COM layer - confirm.

        The value is passed on together with a timezone-aware 1899-12-30
        datetime as the expected last-modified time. Background on why a pytz
        datetime is used:
            How to debug win32com call in python:
            https://stackoverflow.com/questions/34904094/how-to-debug-win32com-call-in-python/34979646#34979646
        """
        return super(OneNote, self).update_page_content(page_changes_xml_in,
                                                        pytz.utc.localize(datetime.datetime(1899, 12, 30)))

    def names(self):
        """Names of all notebooks."""
        return [x.name for x in self.hierarchy]

    def nicknames(self):
        """Nicknames of all notebooks."""
        return [x.nickname for x in self.hierarchy]

    def __getitem__(self, item):
        """Get a notebook by index or by the key ``Hierarchy.__getitem__`` accepts."""
        return self.hierarchy[item]

    def _search(self, *, use_node_cache=True, reparse=False):
        """Return the search-tree node of OneNote, rebuilding it when needed.

        A node that already has children is reused when ``use_node_cache`` is
        true; otherwise a fresh node is created and the node returned by every
        child's ``_search`` is attached to it.
        """
        if not self._node.is_leaf and use_node_cache:
            return self._node

        self._node = self.init_node()
        for x in self._children:
            cur_node = x._search(use_node_cache=use_node_cache, reparse=reparse)
            cur_node.parent = self._node

        return self._node
|
557
|
+
|
558
|
+
|
559
|
+
class Hierarchy:
    """The top level of the OneNote hierarchy: a list of notebooks."""

    def __init__(self, xml=None):
        """Keep ``xml`` and, when it is given, build one ``Notebook`` per child element."""
        self.xml = xml
        self._children = []
        if xml is None:
            return
        self.__deserialize_from_xml(xml)

    def __deserialize_from_xml(self, xml):
        """Replace the children with a ``Notebook`` for every element of ``xml``."""
        notebooks = []
        for element in xml:
            notebooks.append(Notebook(element))
        self._children = notebooks

    def __iter__(self):
        """Iterate over the notebooks in order."""
        yield from self._children

    def __getitem__(self, item):
        """Get a notebook by integer index or by nickname.

        A nickname that matches nothing, or a key of any other type, gives ``None``.
        """
        if isinstance(item, int):
            return self._children[item]
        if not isinstance(item, str):
            return None
        return next((notebook for notebook in self if notebook.nickname == item), None)
|
583
|
+
|
584
|
+
|
585
|
+
class HierarchyNode:
    """Attributes common to every node of the OneNote hierarchy."""

    def __init__(self, parent=None):
        """Initialise all common attributes to the empty string.

        ``parent`` is accepted but not stored here.
        """
        for field in ("name", "path", "id", "last_modified_time", "synchronized"):
            setattr(self, field, "")

    def deserialize_from_xml(self, xml):
        """Copy the common attributes from an xml element.

        Attributes missing on the element become ``None`` (pages have no
        ``path``, for example, which is not an error). ``synchronized`` is not
        touched.
        """
        attribute_map = (
            ("name", "name"),
            ("path", "path"),
            ("id", "ID"),
            ("last_modified_time", "lastModifiedTime"),
        )
        for field, xml_attribute in attribute_map:
            setattr(self, field, xml.get(xml_attribute))
|
599
|
+
|
600
|
+
|
601
|
+
class Notebook(HierarchyNode, _CommonMethods):
    """A notebook: sections and section groups, plus an optional recycle bin."""

    def __init__(self, xml=None):
        """Create the notebook, reading its attributes and children from ``xml`` if given."""
        self.xml = xml
        super().__init__()
        self.nickname = ""
        self.color = ""
        self.is_currently_viewed = ""
        self.recycleBin = None
        self._children = []
        if xml is not None:
            self.__deserialize_from_xml(xml)

        self._node = self.init_node()

    def get_href(self):
        """``onenote:`` link built from ``path`` without its last character.

        A missing ``path`` (``None``) is treated like an empty one.
        """
        return 'onenote:' + (self.path or '')[:-1]

    def init_node(self):
        """Create the search-tree node for this notebook."""
        node = Node(self.name, _category='Notebook',
                    _html_content=f'<a href="{self.get_href()}"><font color="red">《{self.name}》</font></a>')
        return node

    def __deserialize_from_xml(self, xml):
        """Read the notebook attributes and build the child sections / section groups."""
        HierarchyNode.deserialize_from_xml(self, xml)
        self.nickname = xml.get("nickname")
        self.color = xml.get("color")
        self.is_currently_viewed = xml.get("isCurrentlyViewed")
        self.recycleBin = None
        for node in xml:
            if node.tag == namespace + "Section":
                self._children.append(Section(node, self))

            elif node.tag == namespace + "SectionGroup":
                # The attribute is a string: interpret it as an xml boolean so
                # that an explicit "false" is not mistaken for a recycle bin.
                is_recycle_bin = (node.get("isRecycleBin") or "").strip().lower() in ("true", "1")
                if is_recycle_bin:
                    # The recycle bin is kept apart from the regular children.
                    self.recycleBin = SectionGroup(node, self)
                else:
                    self._children.append(SectionGroup(node, self))

    def __iter__(self):
        """Iterate over the child sections and section groups."""
        yield from self._children

    def __str__(self):
        return self.name

    def __getitem__(self, item):
        """Get a child by integer index or by name.

        A name that matches nothing, or a key of any other type, gives ``None``.
        """
        if isinstance(item, int):
            return self._children[item]
        if isinstance(item, str):
            for child in self:
                if child.name == item:
                    return child
        return None

    def _search(self, *, use_node_cache=True, reparse=False):
        """Return the search-tree node of this notebook, rebuilding it when needed.

        A node that already has children is reused when ``use_node_cache`` is
        true; otherwise a fresh node is created and the node returned by every
        child's ``_search`` is attached to it.
        """
        if not self._node.is_leaf and use_node_cache:
            return self._node

        self._node = self.init_node()
        for x in self._children:
            cur_node = x._search(use_node_cache=use_node_cache, reparse=reparse)
            cur_node.parent = self._node

        return self._node
|
668
|
+
|
669
|
+
|
670
|
+
class SectionGroup(HierarchyNode, _CommonMethods):
    """ Section group: a container of sections and nested section groups. """

    def __init__(self, xml=None, parent_node=None):
        self.xml = xml
        super().__init__(self)
        self.is_recycle_Bin = False
        self._children = []
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

        # tree node used when building the search tree
        self._node = self.init_node()

    def get_href(self):
        """ Return the ``onenote:`` link of this group (path without its last character). """
        return f'onenote:{self.path[:-1]}'

    def init_node(self):
        """ Create a fresh, childless tree node describing this section group. """
        link = f'<a href="{self.get_href()}"><font color="#e68a00">〖{self.name}〗</font></a>'
        return Node(self.name, _category='SectionGroup', _html_content=link)

    def __iter__(self):
        # ckz: iteration follows the left-to-right order seen in OneNote:
        #   all sections first, then all section groups
        yield from self._children

    def __str__(self):
        return self.name

    def __deserialize_from_xml(self, xml):
        """ Fill the attributes and the child list from the section-group XML element. """
        HierarchyNode.deserialize_from_xml(self, xml)
        self.is_recycle_Bin = xml.get("isRecycleBin")
        for element in xml:
            if element.tag == namespace + "SectionGroup":
                self._children.append(SectionGroup(element, self))
            if element.tag == namespace + "Section":
                self._children.append(Section(element, self))

    def __getitem__(self, item):
        """ Look up a child by position (int) or by name (str).

        Beware that names may be duplicated; a string lookup returns the first match,
        or ``None`` when nothing matches.
        """
        if isinstance(item, int):
            return self._children[item]
        if isinstance(item, str):
            return next((child for child in self if child.name == item), None)
        return None

    def _search(self, *, use_node_cache=True, reparse=False):
        """ Build (or reuse) the search-tree node of this group and return it.

        :param use_node_cache: reuse the existing node when it already has children
        :param reparse: forwarded unchanged to the children
        """
        already_built = not self._node.is_leaf
        if already_built and use_node_cache:
            return self._node

        self._node = self.init_node()
        # Multithreading made almost no difference in speed here, so children are processed sequentially
        for child in self._children:
            child_node = child._search(use_node_cache=use_node_cache, reparse=reparse)
            child_node.parent = self._node

        return self._node
|
740
|
+
|
741
|
+
|
742
|
+
class Section(HierarchyNode, _CommonMethods):
    """ Section: a container of pages. """

    def __init__(self, xml=None, parent_node=None):
        """
        :param xml: section XML element; when None the object keeps its empty defaults
        :param parent_node: the containing Notebook/SectionGroup object
        """
        self.xml = xml
        super().__init__(self)
        self.color = ""
        self.read_only = False
        self.is_currently_viewed = False
        self._children = []
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

        # tree node used when building the search tree
        self._node = self.init_node()

    def get_href(self):
        """ Return the ``onenote:`` link of this section. """
        return 'onenote:' + self.path

    def init_node(self):
        """ Create a fresh, childless tree node describing this section. """
        node = Node(self.name, _category='Section',
                    _html_content=f'<a href="{self.get_href()}"><font color="#b38600">〈{self.name}〉</font></a>')
        return node

    def __iter__(self):
        yield from self._children

    def __str__(self):
        return self.name

    def __deserialize_from_xml(self, xml):
        """ Fill the attributes from the section XML element; every child element becomes a Page. """
        HierarchyNode.deserialize_from_xml(self, xml)
        self.color = xml.get("color")
        # ``get`` returns its default for a missing attribute instead of raising
        # (assuming an ElementTree/lxml-style element), so the fallback has to be
        # passed as the default; a try/except around the call never triggers.
        self.read_only = xml.get("readOnly", False)
        self.is_currently_viewed = xml.get("isCurrentlyViewed", False)

        self._children = [Page(node, self) for node in xml]

    def __getitem__(self, item):
        """ Look up a page by position (int) or by name (str); ``None`` when no name matches. """
        if isinstance(item, int):
            return self._children[item]
        elif isinstance(item, str):
            for nb in self:
                if nb.name == item:
                    return nb
        return None

    def get_page_num(self):
        """ Return the number of pages in this section. """
        return len(self._children)

    def _search(self, *, use_node_cache=True, reparse=False):
        """ Build (or reuse) the search-tree node of this section and return it.

        Pages are nested by their ``page_level``: level '1' pages hang under the
        section node, level '2' pages under the latest level-1 page, and any other
        level under the latest level-2 page.

        :param use_node_cache: reuse the existing node when it already has children
        :param reparse: forwarded unchanged to the pages
        """
        if not self._node.is_leaf and use_node_cache:
            return self._node
        else:
            self._node = self.init_node()

        # until a page of the given level is seen, the section node itself is the anchor
        page_lv1, page_lv2 = self._node, self._node

        for x in self._children:
            cur_page = x._search(use_node_cache=use_node_cache, reparse=reparse)
            if x.page_level == '1':
                cur_page.parent = self._node
                page_lv2 = page_lv1 = cur_page
            elif x.page_level == '2':
                cur_page.parent = page_lv1
                page_lv2 = cur_page
            else:
                cur_page.parent = page_lv2

        return self._node
|
821
|
+
|
822
|
+
|
823
|
+
class Page(_CommonMethods):
    """ Page: a single OneNote page, able to fetch and parse its own XML content. """

    def __init__(self, xml=None, parent_node=None):
        """
        :param xml: page XML element from the hierarchy; when None the defaults below are kept
        :param parent_node: the containing object (a Section passes itself here)
        """
        self.xml = xml
        self.name = ""
        self.id = ""
        self.date_time = ""
        self.last_modified_time = ""
        self.page_level = ""
        self.is_currently_viewed = ""
        self._children = []
        self.parent = parent_node
        if xml is not None:  # != None is required here, since this can return false
            self.__deserialize_from_xml(xml)

        self._node = self.init_node()  # tree node used for full-text search

    def get_href(self, strict=False):
        """ Return a link to this page.

        Under OneNote's link rules, if several pages in the same section share a
        name, only the first one is found. To point at a page precisely, the
        get_hyperlink_to_object method has to be used (``strict=True``).
        """
        if strict:
            return onenote.get_hyperlink_to_object(self.id)
        else:
            return self.parent.get_href() + f'#{self.name}'

    def init_node(self):
        """ Create a fresh, childless tree node describing this page. """
        # NOTE(review): int() raises ValueError while page_level is still the default ''
        node = Node(self.name, _category='Page', _page_id=self.id, _page_level=int(self.page_level))
        return node

    def __iter__(self):
        for c in self._children:
            yield c

    def __str__(self):
        return self.name

    # Get / Set Meta

    @property
    def root(self):
        """ Follow ``parent`` links upwards until an object without a truthy parent is reached. """
        p = self
        while getattr(p, 'parent', False):
            p = p.parent
        return p

    def __deserialize_from_xml(self, xml):
        """ Fill the attributes from the page XML element; every child element becomes a Meta. """
        self.xml = xml
        self.name = xml.get("name")
        self.id = xml.get("ID")
        self.date_time = xml.get("dateTime")
        self.last_modified_time = xml.get("lastModifiedTime")
        self.page_level = xml.get("pageLevel")
        self.is_currently_viewed = xml.get("isCurrentlyViewed")
        self._children = [Meta(node) for node in xml]

    def get_xml(self, page_info=0):
        """ Get the xml content of the page.

        The content is cached under CACHE_DIR in a file whose name combines the
        page id, ``page_info`` and the last-modified time.

        :param page_info: passed through to ``get_page_content``
        :return: the xml text, or None when the content could not be fetched
        :raises TimeoutError: re-raised with an extended message when fetching times out
        """
        # 1 If a cache file exists, read it directly
        prefix = f'{self.id}_{page_info}_'
        file = CACHE_DIR / (prefix + self.last_modified_time.replace(':', '') + f'.xml')

        if file.is_file():
            return file.read_text()

        # 2 Otherwise there is no cache, or it is out of date: fetch the content through the onenote interface
        try:
            res = super(OneNote, onenote).get_page_content(self.id, page_info)
        except TimeoutError as e:
            e.args = [e.args[0] + f'\n\t{self.abspath_name} 页面获取失败,请检查可能包含的office公式并删除。' \
                                  f'并且看下您的OneNote可能无响应了,请重启OneNote。']
            raise e

        if res is None:
            logging.warning(f'{self.abspath_name} 未成功提取页面内容')
        else:
            # Delete cache files from older time points, then store the new cache file
            for f in CACHE_DIR.glob(f'{prefix}*.xml'):
                f.delete()
            file.write_text(res)

        return res

    def browser_xml(self, page_info=0):
        """ Fetch the page xml and hand it to ``browser`` together with a temporary .xml file. """
        from pyxllib.prog.specialist import browser
        from pyxllib.file.specialist import XlPath
        xml = self.get_xml(page_info)
        browser(xml, file=XlPath.tempfile('.xml'))

    def parse_xml(self, root=None, *, page_info=0, reparse=False):
        """ Get the tree structure of this page; returns a Node root.

        :param root: node to attach the parsed content to; a new 'root' Node is created when None
        :param page_info: passed through to get_xml
        :param reparse: by default the cached parse result is used; when set, parsing is forced again

        The hierarchy has several different forms of expression:
            Hierarchy A: Outline text boxes
            Hierarchy B: heading structure such as h1, h2, h3
            Hierarchy C: indentation levels inside the body text
        so handling the parent reference here is somewhat complex. It has to be
        updated dynamically, returning the new parent in real time.
        For example, when traversing tag.contents, hierarchy C leads to the rather
        odd-looking form parent = dfs_parse_node(y, parent).
        In parse_oe the parent rules are also fairly complex and use some tricks.
        """

        # 0 Helper function

        def _parse_xml(root):
            # ``xml`` is the variable assigned in the enclosing function below
            soup = BeautifulSoup(xml or '', 'xml')
            # self.browser_xml()  # can be used to inspect the original xml content

            style_defs = {}

            # trick: outline_cnt not only marks whether several Outlines require intermediate nodes,
            #   it also records the number of the current Outline.
            outline_cnt = 1 if len(soup.find_all('Outline')) > 1 else 0

            cur_node: XlBs4Tag = soup
            parent = root
            while cur_node:
                x = cur_node
                if isinstance(x, bs4.element.Tag):
                    # The numbers commented after the branches give the order in which they are logically
                    #   met first; the branches were reordered by frequency for efficiency
                    if x.name == 'OE':  # 3
                        parent = OETag.parse2tree(x, parent, style_defs)
                        # the OE subtree was handled by parse2tree; advance with the argument False
                        cur_node = cur_node.next_preorder_node(False)
                        continue
                    elif x.name == 'Outline':  # 2
                        # Handle hierarchy A
                        if outline_cnt:
                            if outline_cnt == 1:
                                parent = Node(f'Outline{outline_cnt}', parent)
                            else:
                                # attach as a sibling of the previous Outline node when one is found
                                pp = XlNode.find_parent(parent, re.compile('^Outline'))
                                parent = Node(f'Outline{outline_cnt}', pp.parent if pp else parent)
                            outline_cnt += 1
                    elif x.name == 'QuickStyleDef':  # 1
                        style_defs[x['index']] = x['name']
                        cur_node = cur_node.next_preorder_node(False)
                        continue

                cur_node = XlBs4Tag.next_preorder_node(cur_node)

            return root

        # 1 Within one program run the same xml content parses to the same tree, so it can be cached
        xml = self.get_xml(page_info=page_info)
        etag = get_etag(xml)

        if root is None:
            root = Node('root')

        if etag in _page_parsed_cache and not reparse:
            root.children = importer.import_(_page_parsed_cache[etag]).children
            return root

        # 2 Otherwise go through the normal parsing flow
        root = _parse_xml(root)
        _page_parsed_cache[etag] = exporter.export(root)

        return root

    def get_page_num(self):
        """ A page always counts as one page. """
        return 1

    def _search(self, *, use_node_cache=True, reparse=False):
        """ Generate all nodes first.

        :param use_node_cache: reuse the existing node when it already has children
        :param reparse: forwarded to parse_xml
        :return: the search-tree node of this page
        """
        if not self._node.is_leaf and use_node_cache:
            # Only a node that has children (is not a leaf) may have been parsed before;
            #   with use_node_cache enabled it is then not parsed again
            return self._node
        else:
            self._node = self.init_node()

        self.parse_xml(self._node, reparse=reparse)
        _free_page_nodes.append(self._node)
        return self._node
|
998
|
+
|
999
|
+
|
1000
|
+
class Meta:
    """ A child element of a page, carrying a ``name`` / ``content`` attribute pair. """

    def __init__(self, xml=None):
        """
        :param xml: element exposing ``get``; when None the empty defaults are kept
        """
        self.xml = xml
        self.name = ""
        self.content = ""
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __str__(self):
        return self.name

    def __deserialize_from_xml(self, xml):
        """ Read the ``name`` and ``content`` attributes from the element. """
        self.name = xml.get("name")
        # ``content`` is the attribute declared in __init__; it used to stay empty
        # because the value was stored on ``id`` only.
        self.content = xml.get("content")
        # keep the historical ``id`` alias, which get_xml and existing callers rely on
        self.id = self.content

    def get_xml(self):
        """ Request page content from the OneNote interface using ``self.id``. """
        return super(OneNote, onenote).get_page_content(self.id)
|
1018
|
+
|
1019
|
+
|
1020
|
+
class PageContent:
    """ Parsed content of a page: titles/outlines as children, embedded objects as files. """

    def __init__(self, xml=None):
        self.xml = xml
        self.name = ""
        self.id = ""
        self.date_time = ""
        self.last_modified_time = ""
        self.page_level = ""
        self.lang = ""
        self.is_currently_viewed = ""
        self._children = []  # Outline and Title objects
        self.files = []  # Ink, Image and InsertedFile objects
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        yield from self._children

    def __str__(self):
        return self.name

    def __deserialize_from_xml(self, xml):
        """ Read the page attributes, then sort the child elements into children and files. """
        field_to_attribute = (
            ("name", "name"),
            ("id", "ID"),
            ("date_time", "dateTime"),
            ("last_modified_time", "lastModifiedTime"),
            ("page_level", "pageLevel"),
            ("lang", "lang"),
            ("is_currently_viewed", "isCurrentlyViewed"),
        )
        for field, attribute in field_to_attribute:
            setattr(self, field, xml.get(attribute))

        for element in xml:
            tag = element.tag
            if tag == namespace + "Outline":
                self._children.append(Outline(element))
            elif tag == namespace + "Ink":
                self.files.append(Ink(element))
            elif tag == namespace + "Image":
                self.files.append(Image(element))
            elif tag == namespace + "InsertedFile":
                self.files.append(InsertedFile(element))
            elif tag == namespace + "Title":
                self._children.append(Title(element))
|
1062
|
+
|
1063
|
+
|
1064
|
+
class Title:
    """ The title element of a page; its children are the OE elements found directly inside it. """

    def __init__(self, xml=None):
        self.xml = xml
        self.style = ""
        self.lang = ""
        self._children = []
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __str__(self):
        return "Page Title"

    def __iter__(self):
        yield from self._children

    def __deserialize_from_xml(self, xml):
        """ Read ``style``/``lang`` and wrap each direct OE child element. """
        for field in ("style", "lang"):
            setattr(self, field, xml.get(field))
        for element in xml:
            if element.tag != namespace + "OE":
                continue
            self._children.append(OE(element, self))
|
1087
|
+
|
1088
|
+
|
1089
|
+
class Outline:
    """ An outline element of a page; its children are the OE elements inside its OEChildren. """

    def __init__(self, xml=None):
        self.xml = xml
        self.author = ""
        self.author_initials = ""
        self.last_modified_by = ""
        self.last_modified_by_initials = ""
        self.last_modified_time = ""
        self.id = ""
        self._children = []
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        yield from self._children

    def __str__(self):
        return "Outline"

    def __deserialize_from_xml(self, xml):
        """ Read the outline attributes and collect the OE elements nested in OEChildren. """
        field_to_attribute = (
            ("author", "author"),
            ("author_initials", "authorInitials"),
            ("last_modified_by", "lastModifiedBy"),
            ("last_modified_by_initials", "lastModifiedByInitials"),
            ("last_modified_time", "lastModifiedTime"),
            ("id", "objectID"),
        )
        for field, attribute in field_to_attribute:
            setattr(self, field, xml.get(attribute))

        children = self._children
        for container in xml:
            if container.tag != namespace + "OEChildren":
                continue
            for element in container:
                if element.tag == namespace + "OE":
                    children.append(OE(element, self))
|
1123
|
+
|
1124
|
+
|
1125
|
+
class Position:
    """ Holder of the ``x``/``y``/``z`` attributes of a position element. """

    def __init__(self, xml=None, parent_node=None):
        self.xml = xml
        self.x = self.y = self.z = ""
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __deserialize_from_xml(self, xml):
        """ Copy the three coordinate attributes from the element. """
        for axis in ("x", "y", "z"):
            setattr(self, axis, xml.get(axis))
|
1140
|
+
|
1141
|
+
|
1142
|
+
class Size:
    """ Holder of the ``width``/``height`` attributes of a size element. """

    def __init__(self, xml=None, parent_node=None):
        self.xml = xml
        self.width = self.height = ""
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __deserialize_from_xml(self, xml):
        """ Copy both dimension attributes from the element. """
        for dimension in ("width", "height"):
            setattr(self, dimension, xml.get(dimension))
|
1155
|
+
|
1156
|
+
|
1157
|
+
class OE:
    """ An outline element: text plus nested OE children and embedded files. """

    def __init__(self, xml=None, parent_node=None):
        self.xml = xml
        self.creation_time = ""
        self.last_modified_time = ""
        self.last_modified_by = ""
        self.id = ""
        self.alignment = ""
        self.quick_style_index = ""
        self.style = ""
        self.text = ""
        self._children = []  # nested OE objects
        self.parent = parent_node
        self.files = []  # Image, Ink and InsertedFile objects
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        yield from self._children

    def __str__(self):
        return getattr(self, "text", "Empty OE")

    def __deserialize_from_xml(self, xml):
        """ Read the OE attributes, then its text, nested OE elements and embedded files. """
        field_to_attribute = (
            ("creation_time", "creationTime"),
            ("last_modified_time", "lastModifiedTime"),
            ("last_modified_by", "lastModifiedBy"),
            ("id", "objectID"),
            ("alignment", "alignment"),
            ("quick_style_index", "quickStyleIndex"),
            ("style", "style"),
        )
        for field, attribute in field_to_attribute:
            setattr(self, field, xml.get(attribute))

        for element in xml:
            tag = element.tag
            if tag == namespace + "T":
                # an empty text element is recorded with a placeholder
                self.text = "NO TEXT" if element.text is None else element.text
            elif tag == namespace + "OEChildren":
                for nested in element:
                    if nested.tag == namespace + "OE":
                        self._children.append(OE(nested, self))
            elif tag == namespace + "Image":
                self.files.append(Image(element, self))
            elif tag == namespace + "InkWord":
                self.files.append(Ink(element, self))
            elif tag == namespace + "InsertedFile":
                self.files.append(InsertedFile(element, self))
|
1214
|
+
|
1215
|
+
|
1216
|
+
class OETag(bs4.element.Tag):
    """ Helper methods operating on bs4 ``OE`` tags; both are invoked as ``OETag.method(tag, ...)``. """

    def get_text2(self):
        """ This is a functional interface prepared for bs4.Tag.

        Looks only at direct children of the tag.

        :return: (t1, t2) where, for a 'T' child, t1 is its text re-parsed with
            BeautifulSoup/lxml and t2 its raw text; for a 'Table' or 'Image'
            child both are the marker '[Table]' / '[Image]'; otherwise both are ''
        """
        if y := self.find('T', recursive=False):
            t1 = BeautifulSoup(y.text, 'lxml').text
            t2 = y.text
        elif y := self.find('Table', recursive=False):
            # First, Columns declares that there are m columns in total and the width of each
            # Then each row is a Row containing m Cells
            t1 = '[Table]'
            t2 = t1
        elif y := self.find('Image', recursive=False):
            t1 = '[Image]'
            t2 = t1
        else:
            t1 = ''
            t2 = ''
        return t1, t2

    def parse2tree(self, parent, style_defs):
        """ Parse a Tag node into a tree of anytree-style nodes.

        :param Node parent: the anytree parent node;
            the content parsed from the current Tag is stored under parent.children
        :param style_defs: the style table parsed earlier (quick style index -> style name)
        :return: the parent to use for the following siblings; it changes only when
            the current tag is a heading
        """

        # 1 Obtain the 3 main properties
        style_name = style_defs.get(self.get('quickStyleIndex', ''), '')
        pure_text, html_text = OETag.get_text2(self)  # text content
        m = self.find('OEChildren', recursive=False)  # child nodes of textual nature

        # Skip empty data
        # if not pure_text and not m:
        #     return parent

        # 2 Handle hierarchy B
        if re.match(r'h\d$', style_name):  # heading kind
            while True:
                parent_style_name = getattr(parent, '_style_name', '')
                if re.match(r'h\d$', parent_style_name) and parent_style_name >= style_name:
                    # The parent is also a heading and its style name compares >= the current one
                    #   (same or deeper heading level), so the real parent has to be looked for further up
                    parent = parent.parent
                else:
                    break
            # A heading resets parent and acts as an intermediate node itself
            cur_node = parent = Node(pure_text, parent, _style_name=style_name, _html_content=html_text)
        else:
            cur_node = Node(pure_text, parent, _html_content=html_text)

        setattr(cur_node, '_category', 'OE')
        setattr(cur_node, '_page_id', XlNode.find_parent(self, 'Page')['ID'])  # noqa find_parent is applicable to bs4.element.Tag
        setattr(cur_node, '_object_id', self['objectID'])

        # 3 Special structures such as tables and images get an extra level
        if pure_text.startswith('[Table]'):
            for z in self.find_all('T'):
                Node(BeautifulSoup(z.text, 'lxml').text, cur_node, _html_content=z.text)
        elif pure_text.startswith('[Image]'):
            y = self.find('Image', recursive=False)
            # one child node per line of the image's 'alt' attribute
            for z in y.get('alt', '').splitlines():
                Node(z, cur_node)

        # 4 Handle hierarchy C
        if m:
            for y in m.find_all('OE', recursive=False):
                OETag.parse2tree(y, cur_node, style_defs)

        return parent
|
1286
|
+
|
1287
|
+
|
1288
|
+
class InsertedFile:
    """ A file embedded in a page, described by the attributes of its XML element. """

    # need to add position data to this class

    def __init__(self, xml=None, parent_node=None):
        """
        :param xml: element exposing ``get``; when None the empty defaults are kept
        :param parent_node: the object containing this file
        """
        self.xml = xml
        self.path_cache = ""
        self.path_source = ""
        self.preferred_name = ""
        self.last_modified_time = ""
        self.last_modified_by = ""
        self.id = ""
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        # kept as is: iterating an inserted file yields a single None
        yield None

    def __str__(self):
        """ Return the preferred file name, or "Unnamed File" when it is empty or missing. """
        # The attribute is ``preferred_name``; reading ``preferredName`` always
        # raised AttributeError, so the real name was never returned.
        return self.preferred_name or "Unnamed File"

    def __deserialize_from_xml(self, xml):
        """ Copy the file attributes from the element. """
        self.path_cache = xml.get("pathCache")
        self.path_source = xml.get("pathSource")
        self.preferred_name = xml.get("preferredName")
        self.last_modified_time = xml.get("lastModifiedTime")
        self.last_modified_by = xml.get("lastModifiedBy")
        self.id = xml.get("objectID")
|
1320
|
+
|
1321
|
+
|
1322
|
+
class Ink:
    """ An ink object of a page, described by its XML attributes and CallbackID/Data children. """

    # need to add position data to this class

    def __init__(self, xml=None, parent_node=None):
        """
        :param xml: element exposing ``get``; when None the empty defaults are kept
        :param parent_node: the object containing this ink
        """
        self.xml = xml
        self.recognized_text = ""
        self.x = ""
        self.y = ""
        self.ink_origin_x = ""
        self.ink_origin_y = ""
        self.width = ""
        self.height = ""
        self.data = ""
        self.callback_id = ""
        self.parent = parent_node

        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        # kept as is: iterating an ink object yields a single None
        yield None

    def __str__(self):
        """ Return the recognized text, or "Unrecognized Ink" when it is empty or missing. """
        # The attribute is ``recognized_text``; reading ``recognizedText`` always
        # raised AttributeError, so the recognized text was never returned.
        return self.recognized_text or "Unrecognized Ink"

    def __deserialize_from_xml(self, xml):
        """ Copy the ink attributes, then read the CallbackID and Data child elements. """
        self.recognized_text = xml.get("recognizedText")
        self.x = xml.get("x")
        self.y = xml.get("y")
        self.ink_origin_x = xml.get("inkOriginX")
        self.ink_origin_y = xml.get("inkOriginY")
        self.width = xml.get("width")
        self.height = xml.get("height")

        for node in xml:
            if node.tag == namespace + "CallbackID":
                self.callback_id = node.get("callbackID")
            elif node.tag == namespace + "Data":
                self.data = node.text
|
1365
|
+
|
1366
|
+
|
1367
|
+
class Image:
    """ An image of a page, described by its XML attributes and CallbackID/Data children. """

    def __init__(self, xml=None, parent_node=None):
        """
        :param xml: element exposing ``get``; when None the empty defaults are kept
        :param parent_node: the object containing this image
        """
        self.xml = xml
        self.format = ""
        self.original_page_number = ""
        self.last_modified_time = ""
        self.id = ""
        self.callback_id = None
        self.data = ""
        self.parent = parent_node
        if xml is not None:
            self.__deserialize_from_xml(xml)

    def __iter__(self):
        # kept as is: iterating an image yields a single None
        yield None

    def __str__(self):
        """ Return "<format> Image"; a missing format is rendered as an empty prefix. """
        # ``format`` is None when the element has no such attribute, and
        # ``None + " Image"`` would raise TypeError inside str().
        return f'{self.format or ""} Image'

    def __deserialize_from_xml(self, xml):
        """ Copy the image attributes, then read the CallbackID and Data child elements. """
        self.format = xml.get("format")
        self.original_page_number = xml.get("originalPageNumber")
        self.last_modified_time = xml.get("lastModifiedTime")
        self.id = xml.get("objectID")
        for node in xml:
            if node.tag == namespace + "CallbackID":
                self.callback_id = node.get("callbackID")
            elif node.tag == namespace + "Data":
                # an empty Data element leaves the default '' in place
                if node.text is not None:
                    self.data = node.text
|
1398
|
+
|
1399
|
+
|
1400
|
+
# Module-level OneNote instance, created at import time; Page, Meta and start_server reference it by name
onenote = OneNote()
|
1401
|
+
|
1402
|
+
|
1403
|
+
def start_server(root=None, edits=None, *, port=80, reparse=False):
    """ Start a local onenote search service.

    :param str root: the onenote root directory to search; when not set, all
            OneNote notes are initialised by default
        All path components are separated with a forward slash /
        Note that writing it this way reduces flexibility:
            on one hand numbers could otherwise be used as direct indices, whereas
                in this mode a number is always treated as a page name;
            on the other hand a path may itself contain /
            These cases are rare though; when a sub-directory is problematic,
                locate it from a larger directory instead.
            If that really does not work, copy this function and extend it yourself.
    :param str|list[str] edits: directories whose search tree must be forcibly
            refreshed before every search
        str, several entries separated by English commas
        list[str], each str only needs to be the path relative to root
    :param int port: port the service listens on; also used in the example link
        shown when no search pattern is given
    :param reparse: forwarded to ``get_search_tree`` when the search data is initialised

    >> start_server('核心', ['2022ch4'])
    >> start_server('共享/陈坤泽', ['杂项', 'CF', '吃土乡/大家的幻想乡'])
    """
    from flask import Flask, request

    # 1 Initialise the search data
    parent = onenote(root)
    parent.get_search_tree(print_mode=True, reparse=reparse)

    if isinstance(edits, str):
        # A string input probably comes from the command line and has to be converted to a list
        edits = edits.split(',')  # several arguments separated by English commas

    # 2 Open the service endpoints
    app = Flask(__name__)

    @app.route('/search/onenote', methods=['GET'])
    def search_onenote():
        def get_args(key, default=None):
            return request.args.get(key, default)

        pattern = get_args('pattern')
        if not pattern:
            # The example link has to point at the port actually served; the port is
            # only left out for the default 80 so the default output stays unchanged
            host = 'localhost' if port == 80 else f'localhost:{port}'
            ref_url = f'http://{host}/search/onenote?pattern=test'
            return f'请输入检索内容,例如 <a href={ref_url}>{ref_url}</a>'

        # Parse the feature details from the query string
        res = parent.search(pattern, edits=edits,
                            child_depth=int(get_args('child_depth', 0)),
                            return_mode=get_args('return_mode', 'html'),
                            padding_mode=int(get_args('padding_mode', 0)),
                            print_mode=get_args('print_mode', True),
                            href_mode=int(get_args('href_mode', 2)),  # dynamic links by default
                            dedent=int(get_args('dedent', 1)))
        return res

    @app.route('/search/onenote/linkid', methods=['GET'])
    def linkid():
        """ Jump to the target identified by id. """
        page_id = request.args.get('id', None)
        object_id = request.args.get('object_id', '')
        onenote.navigate_to(page_id, object_id)
        # Return a page that closes itself immediately
        return '<script type="text/javascript">window.close();</script>'

    # Child threads cannot use the onenote resource created through win32com; to use
    # the linkid feature the requests have to be handled by a single main thread
    app.run(host='0.0.0.0', port=port, threaded=False)
|
1464
|
+
|
1465
|
+
|
1466
|
+
# Script entry point: when run directly, hand control to python-fire
if __name__ == '__main__':
    import fire

    fire.Fire()
|