pyDiffTools 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydifftools/__init__.py +11 -0
- pydifftools/check_numbers.py +70 -0
- pydifftools/command_line.py +747 -0
- pydifftools/command_registry.py +65 -0
- pydifftools/comment_functions.py +39 -0
- pydifftools/continuous.py +194 -0
- pydifftools/copy_files.py +75 -0
- pydifftools/diff-doc.js +193 -0
- pydifftools/doc_contents.py +147 -0
- pydifftools/flowchart/__init__.py +15 -0
- pydifftools/flowchart/dot_to_yaml.py +114 -0
- pydifftools/flowchart/graph.py +620 -0
- pydifftools/flowchart/watch_graph.py +168 -0
- pydifftools/html_comments.py +33 -0
- pydifftools/html_uncomments.py +524 -0
- pydifftools/match_spaces.py +235 -0
- pydifftools/notebook/__init__.py +0 -0
- pydifftools/notebook/fast_build.py +1502 -0
- pydifftools/notebook/tex_to_qmd.py +319 -0
- pydifftools/onewordify.py +149 -0
- pydifftools/onewordify_undo.py +54 -0
- pydifftools/outline.py +173 -0
- pydifftools/rearrange_tex.py +188 -0
- pydifftools/searchacro.py +80 -0
- pydifftools/separate_comments.py +73 -0
- pydifftools/split_conflict.py +213 -0
- pydifftools/unseparate_comments.py +69 -0
- pydifftools/update_check.py +31 -0
- pydifftools/wrap_sentences.py +501 -0
- pydifftools/xml2xlsx.vbs +33 -0
- pydifftools-0.1.8.dist-info/METADATA +146 -0
- pydifftools-0.1.8.dist-info/RECORD +36 -0
- pydifftools-0.1.8.dist-info/WHEEL +5 -0
- pydifftools-0.1.8.dist-info/entry_points.txt +2 -0
- pydifftools-0.1.8.dist-info/licenses/LICENSE.md +28 -0
- pydifftools-0.1.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,524 @@
|
|
|
1
|
+
# again rerun
|
|
2
|
+
from lxml import html, etree
|
|
3
|
+
import os
|
|
4
|
+
from pyspecdata import *
|
|
5
|
+
from unidecode import unidecode
|
|
6
|
+
import re
|
|
7
|
+
import sys
|
|
8
|
+
from .comment_functions import (
|
|
9
|
+
generate_alphabetnumber,
|
|
10
|
+
matchingbrackets,
|
|
11
|
+
comment_definition,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
manual_math_conversion = (
    False  # this hacks some stuff that pandoc does much better
)


def _sub_until_stable(pattern, repl, text):
    """Apply ``re.sub(pattern, repl, ...)`` repeatedly until nothing changes.

    Parameters: *pattern* and *repl* are passed straight to ``re.sub``;
    *text* is the string to rewrite.  Returns the fixed-point string.
    """
    while True:
        newtext = re.sub(pattern, repl, text)
        if newtext == text:
            return text
        text = newtext


def _escape_percent_signs(text):
    """Return *text* with every percent sign written as backslash-percent.

    A percent sign that is already preceded by a backslash is left as a
    single backslash-percent.  The earlier three-step placeholder swap
    re-escaped the placeholder's own percent signs, so an already escaped
    percent came out with a stray ``EXPLICITPAREN`` in the middle.
    """
    return re.sub(r"\\?%", lambda _match: r"\%", text)


# Read the whole HTML document named by the first command-line argument.
with open(sys.argv[1], "r") as fp:
    content = fp.read()
# {{{ need to remove weird linebreaks with the following, or it doesn't
# interpret the styles correctly
content = re.sub(r":\n *", r":", content)
# Strip newlines/spaces that trail an mso-comment-reference value ...
content = _sub_until_stable(
    r"('mso-comment-reference:[^']*)[\n ]+", r"\1", content
)
# ... and make sure each such value ends in ";" before its closing quote
# (the xpath that later looks these references up includes the ";").
content = _sub_until_stable(
    r"('mso-comment-reference:[^'\"]*[^;])(['\"])", r"\1;\2", content
)
newcontent = content  # name kept: the original script left it defined
content = _escape_percent_signs(content)
|
|
43
|
+
if manual_math_conversion:
    # NOTE(review): indentation was lost in the diff rendering this was
    # recovered from; every replacement below (including the final "$$"
    # removal) is assumed to sit inside this ``if`` -- confirm against the
    # real file.

    def _math(command):
        """Wrap *command* in the enter/leave math-mode placeholders."""
        return "%ENTERMATHMODE%" + command + "%LEAVEMATHMODE%"

    # Applied strictly in this order (" \xb5M" must precede "\xb5M", and the
    # "$$" clean-up comes last).
    _manual_replacements = (
        ("Δ", _math(r"\Delta")),
        ("\xb0C", r"\degC "),
        (" \xb5M", r"\uM "),
        ("\xb5M", r"\uM "),
        ("α", _math(r"\alpha")),
        ("β", _math(r"\beta")),
        ("γ", _math(r"\gamma")),
        ("δ", _math(r"\delta")),
        ("ε", _math(r"\varepsilon")),
        ("ζ", _math(r"\zeta")),
        ("η", _math(r"\eta")),
        ("θ", _math(r"\theta")),
        ("ι", _math(r"\iota")),
        ("κ", _math(r"\kappa")),
        ("λ", _math(r"\lambda")),
        ("μ", _math(r"\mu")),
        ("ν", _math(r"\nu")),
        ("ξ", _math(r"\xi")),
        ("ο", _math(r"\omicron")),
        ("π", _math(r"\pi")),
        ("ρ", _math(r"\rho")),
        ("σ", _math(r"\sigma")),
        ("τ", _math(r"\tau")),
        ("φ", _math(r"\varphi")),
        ("χ", _math(r"\chi")),
        ("ψ", _math(r"\psi")),
        ("ω", _math(r"\omega")),
        ("′", _math(r"\'")),
        ("—", r"--"),
        ("’", r"'"),
        ("“", r"``"),
        ("”", r"''"),
        ("ℜ", _math(r"\Re")),
        ("⇒", _math(r"\Rightarrow")),
        ("⇐", _math(r"\Leftarrow")),
        ("∑", _math(r"\Sum")),
        ("−", r"--"),
        ("∕", r"/"),
        ("∗", _math(r"^*")),
        ("∼", _math(r"\sim")),
        ("∝", _math(r"\propto")),
        ("∞", _math(r"\infty")),
        ("≈", _math(r"\approx")),
        ("≡", _math(r"\equiv")),
        ("≤", _math(r"\le")),
        ("≥", _math(r"\ge")),
        ("≪", _math(r"\ll")),
        ("≫", _math(r"\gg")),
        ("⋅", _math(r"\cdot")),
        ("𝔢", _math(r"\mathfrak{e}")),
        ("$$", ""),  # math symbols doubled back on each other
    )
    for _source_text, _latex_text in _manual_replacements:
        content = content.replace(_source_text, _latex_text)
|
|
137
|
+
# }}}
|
|
138
|
+
# content = re.sub(r'mso-comment-reference:([a-zA-Z_0-9]+)&([a-zA-Z_0-9]+)',r'mso-comment-reference:\1AMPERSAND\2',content)
|
|
139
|
+
# content = re.sub(r'mso-comment-reference:[\n ]*([a-zA-Z0-9]+)',r'narg!mso-comment-reference:\1',content)
|
|
140
|
+
doc = html.fromstring(content)
commentlabel_re = re.compile(r"\[([A-Z]+)([0-9])\]")
inlineequation_re = re.compile(r"\$([^\$]*)\$")
thisbody = doc.find("body")
print("I found the body", lsafen(thisbody))
num = 0
numcomments = 0
numcompara = 0
comment_dict = {}  # comment number (string of digits) -> comment text
comment_label_re = re.compile(r"_com_([0-9]+)")


def _find_comment_label(comment):
    """Return the number taken from the ``_com_<n>`` id inside *comment*.

    Every element of class ``msocomtxt`` under *comment* is inspected and
    the last matching id wins.

    Raises ValueError if an id does not look like ``_com_<n>`` or if
    *comment* holds no ``msocomtxt`` element at all.
    """
    commentlabel = None
    for m in comment.find_class("msocomtxt"):
        mymatch = comment_label_re.match(m.attrib["id"])
        if not mymatch:
            raise ValueError(
                "I don't understand what the comment id "
                + m.attrib["id"]
                + " means"
            )
        commentlabel = mymatch.groups()[0]
        print("that means it's comment", commentlabel)
    if commentlabel is None:
        raise ValueError("I found a comment without any msocomtxt element")
    return commentlabel


def process_comment_text(thistag, numcompara, commenttext, comment=None):
    """Collect the text of one comment paragraph.

    Parameters:
        thistag: the paragraph element; its
            ``mso-special-character:comment`` spans are dropped first.
        numcompara: paragraph count so far for this comment.
        commenttext: list that receives the transliterated paragraph text.
        comment: the enclosing comment element; defaults to the
            module-level loop variable ``k``, which is what this helper
            read when it was defined inside the loop.

    Returns ``(commentlabel, numcompara + 1)``.
    """
    if comment is None:
        comment = k
    commentlabel = _find_comment_label(comment)
    numcompara += 1
    for m in thistag.xpath(
        'descendant-or-self::span[@style="mso-special-character:comment"]'
    ):
        m.drop_tree()
        print("dropped special character")
    commenttext.append(unidecode(thistag.text_content()))
    return commentlabel, numcompara


for j in doc.xpath('//*[contains(@style,"font-family:Symbol")]'):
    print('found symbol with text"', j.text, '" and dropped the tag')
    j.drop_tag()
class_types = ["MsoCommentText", "MsoNormal", "indent", "noindent"]
for j in doc.xpath('//div[@style="mso-element:comment-list"]'):
    num += 1
    for k in j.xpath('descendant-or-self::*[@style="mso-element:comment"]'):
        numcomments += 1
        numcompara = 0
        commenttext = []
        found_something = False
        for class_type in class_types:
            for l in k.find_class(class_type):
                commentlabel, numcompara = process_comment_text(
                    l, numcompara, commenttext, k
                )
                found_something = True
        if not found_something:
            print(
                (
                    "Wargning: I found no "
                    + ",".join(class_types)
                    + " in this comment --\n%s\n -- in the future, should search by paragraph tag, instead"
                    % html.tostring(k, encoding="unicode")
                )
            )
            # Without this lookup the label of the *previous* comment was
            # reused, so that comment's text got replaced by an empty one.
            commentlabel = _find_comment_label(k)
        k.drop_tree()  # drop the stuff at the end
        print(
            "for comment %d, I find %d paragraphs" % (numcomments, numcompara)
        )
        # and load into the dictionary
        comment_dict[commentlabel] = "\n\n".join(commenttext)
        print("text looks like this:", comment_dict[commentlabel])
print("I found %d comment lists and %d comments" % (num, numcomments))
|
|
225
|
+
# Maps the initials Word puts in a comment anchor ("[JF3]") to the name
# used for the LaTeX comment macro.
initial_translation_dict = {
    "JF": "john",
    "y": "yuan",
    "CoLA&S": "peter",
    "SH": "songi",
    "PQ": "peter",
    "KE": "keith",
}
commentlabel_re = re.compile(r"\[([A-Za-z&]+)([0-9]+)\]")
commentid_re = re.compile(r"_anchor_([0-9]+)")
numcomrefs = 0  # anchors successfully parsed
numcomrefsrepd = 0  # highlighted spans rewritten as macros
comment_file_text = ""  # accumulated comment_definition(...) output
current_comment_number = 0
for thiscommentreference in doc.find_class("MsoCommentReference"):
    thiscommentreference.drop_tag()
for thiscommentreference in doc.find_class("msocomanchor"):
    comref_text = thiscommentreference.text
    if comref_text is None:
        print("Warning, found a comment with no text")
        continue
    m = commentlabel_re.match(comref_text)
    if not m:
        raise RuntimeError("Warning, I couldn't parse this!!")
    initials, number = m.groups()
    try:
        author = initial_translation_dict[initials]
    except KeyError:
        raise ValueError(
            "I don't know who %s is -- add to initial_translation_dict"
            % initials
        ) from None
    print("I found comment %s by %s" % (number, author))
    thiscommentreference.text = ""
    thiscommentreference.drop_tag()
    prevcomrefsrepd = numcomrefsrepd
    for k in doc.xpath(
        'descendant-or-self::*[contains(@style,"mso-comment-reference:%s_%s;")]'
        % (initials, number)
    ):
        print(
            "\nThis reference has the text:",
            html.tostring(k, encoding="unicode"),
        )
        if k.text is None:
            k.text = ""
        if number not in comment_dict:
            raise KeyError(
                repr(number)
                + "is not in comment_dict keys: "
                + repr(list(comment_dict.keys()))
            )
        commentbody = comment_dict[number]
        if commentbody.startswith("(need to do:) ") and author == "john":
            # if it's a "need to do", use the ntd macro and strip the prefix
            macro = "ntd"
            commentbody = commentbody[14:]
        else:
            macro = author
        label = generate_alphabetnumber(current_comment_number)
        # The highlighted text becomes the macro argument; square brackets
        # are blanked out.
        k.text = (
            r"\%s%s{" % (macro, label)
            + k.text_content().replace("[", " ").replace("]", " ")
            + "}"
        )
        comment_file_text += comment_definition(
            macro + label, macro, commentbody
        )
        current_comment_number += 1
        k.drop_tag()
        print("I convert it to this:", html.tostring(k, encoding="unicode"))
        numcomrefsrepd += 1
    if prevcomrefsrepd == numcomrefsrepd:
        # html.tostring() returns bytes unless a text encoding is requested;
        # concatenating those bytes with str raised TypeError here.
        print(
            "Warning: I can't find the highlighted text for the comment:\n\n"
            + html.tostring(thiscommentreference, encoding="unicode")
            + "so I'm dropping it"
        )
    numcomrefs += 1
print(
    "I found %d comment references and replaced %d"
    % (numcomrefs, numcomrefsrepd)
)
|
|
335
|
+
def _replace_with_math(element, template, text, tail_filter=None):
    """Replace *element* by ``template % text`` and drop its tag.

    Parameters:
        element: the lxml element to flatten; its children are removed.
        template: %-format string with one ``%s`` slot (the math markup).
        text: text captured from the element *before* calling this helper.
        tail_filter: optional callable applied to the element's tail text.

    When *text* is empty the element is dropped without inserting markup.
    """
    # {{{ remove children
    for child in element.getchildren():
        child.drop_tree()
    # }}}
    if len(text) > 0:
        tail = element.tail if element.tail is not None else ""
        if tail_filter is not None:
            tail = tail_filter(tail)
        element.tail = ""
        element.text = template % text + tail
    element.drop_tag()


if manual_math_conversion:
    for j in doc.xpath("//sub"):
        _replace_with_math(
            j, "%%ENTERMATHMODE%%_{%s}%%LEAVEMATHMODE%%", j.text_content()
        )
    for j in doc.xpath("//sup"):
        # The text used to be .encode("utf-8")-ed first, which on Python 3
        # formats as "b'...'" inside the superscript.
        _replace_with_math(
            j, "%%ENTERMATHMODE%%^{%s}%%LEAVEMATHMODE%%", j.text_content()
        )
# NOTE(review): indentation was lost in the diff rendering this was recovered
# from; the font-class loop below is assumed to run unconditionally (not
# under ``if manual_math_conversion``) -- confirm against the real file.
for mathmodefontsize in [7, 8, 12, 81, 121]:
    for mathmodefonttype in ["cmmi", "cmr", "cmsy"]:
        for j in doc.find_class(
            "%s-%d" % (mathmodefonttype, mathmodefontsize)
        ):  # find the math-mode stuff
            _replace_with_math(
                j,
                "%%ENTERMATHMODE%%%s%%LEAVEMATHMODE%%",
                str(unidecode(j.text_content())),
                tail_filter=unidecode,
            )
|
|
391
|
+
# Character typed in the Symbol font -> LaTeX replacement.
symbol_lookup = {
    "x": "\\xi ",
    "p": "\\pi",
    "k": "\\kappa",
    "s": "\\sigma",
    "y": "\\psi",
    "h": "\\eta",
    "N": "\\Nu",
    "n": "\\nu",
    "e": "\\epsilon",
    "o": "\\omicron",
    "r": "\\rho",
    " ": " ",
    "_": "_",
    "{": "{",
    "}": "}",
}


def _symbol_text_to_latex(text):
    """Translate Symbol-font *text* to LaTeX using ``symbol_lookup``.

    A backslash starts a command that is copied verbatim up to (not
    including) the next space, backslash or ``{``; every other character
    is looked up in ``symbol_lookup``.

    Raises ValueError for a character missing from ``symbol_lookup``.
    """
    pieces = []
    position = 0
    while position < len(text):
        character = text[position]
        if character == "\\":
            print("found command")
            print("pass %s\n" % character)
            pieces.append(character)
            position += 1
            # gobble up commands; stopping at end-of-text avoids the
            # IndexError the unguarded lookahead used to raise
            while position < len(text) and text[position] not in (
                " ",
                "\\",
                "{",
            ):
                print("pass %s\n" % text[position])
                pieces.append(text[position])
                position += 1
            continue
        try:
            pieces.append(symbol_lookup[character])
        except KeyError:
            raise ValueError(
                "symbol for symbol font '%s' not found! Open the script and put it in the symbol_lookup dictionary"
                % character
            ) from None
        position += 1
    return "".join(pieces)


for j in doc.find_class("GramE"):
    j.drop_tag()
# NOTE(review): an earlier loop in this script already calls drop_tag() on
# every element matched by this same xpath, so this loop may never find
# anything -- confirm whether that is intended.
for j in doc.xpath('//*[contains(@style,"font-family:Symbol")]'):
    # str(None) used to leak the literal text "None" into the output when
    # the element had no tail or no text.
    thistail = j.tail if j.tail is not None else ""
    j.tail = ""
    thistext = j.text if j.text is not None else ""
    newtext = (
        "%ENTERMATHMODE%" + _symbol_text_to_latex(thistext) + "%LEAVEMATHMODE%"
    )
    j.text = newtext + thistail
    j.drop_tag()
|
|
441
|
+
# print lsafen(map(html.tostring,newlist),wrap = 60)
|
|
442
|
+
def _dollars_to_placeholders(text):
    """Replace each unescaped ``$`` in *text* by a math-mode placeholder.

    Unescaped dollar signs alternate between ``%ENTERMATHMODE%`` and
    ``%LEAVEMATHMODE%``; a ``$`` directly preceded by a backslash is kept.
    A ``$`` that is the very first character counts as unescaped (the
    index-based version compared it against the *last* character).
    """
    pieces = []
    inmathmode = False
    previous = ""
    for character in text:
        if character == "$" and previous != "\\":
            pieces.append(
                "%LEAVEMATHMODE%" if inmathmode else "%ENTERMATHMODE%"
            )
            inmathmode = not inmathmode
        else:
            pieces.append(character)
        previous = character
    return "".join(pieces)


newfile = re.sub(r"(.*)(\.htm.*)", r"\1_texcomm\2", sys.argv[1])
if newfile == sys.argv[1]:
    # No ".htm" in the name: writing the output would truncate the input.
    raise ValueError(
        "I expected an input file name containing '.htm', but got %r"
        % sys.argv[1]
    )
# html.tostring() returns ASCII bytes by default; decode so that the string
# searches and the "$" scan below operate on text.
content = html.tostring(doc).decode("ascii")
for mathmodefonttype in ["cmmi", "cmr", "cmsy"]:
    if "class=%s-" % mathmodefonttype in content:
        start = content.find("%s-" % mathmodefonttype)
        raise ValueError(
            "error, I see a string '%s' which indicates math mode, but apparently you're not searching for the correct font size, so go add the font into the list of math mode font sizes"
            % content[start : start + 14]
        )
content = _dollars_to_placeholders(content)
# Opened only after validation so a failed run leaves no empty output file;
# the handle is written to and closed by the code further down the script.
fp = open(newfile, "w")
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
# content = content.replace('%ENTERMATHMODE%','$')
|
|
472
|
+
# content = content.replace('%LEAVEMATHMODE%','$')
|
|
473
|
+
def decodemathmode(arg):
    """Turn the math-mode placeholders in *arg* into ``$`` delimiters.

    Adjacent environments separated only by digits or ``( ) , .`` are
    merged first, as are doubled subscripts/superscripts
    (``_{a}_{b}`` -> ``_{ab}``).  Each ``%ENTERMATHMODE%`` then becomes an
    opening ``$`` and its ``%LEAVEMATHMODE%`` a closing ``$``; placeholder
    pairs nested inside an open environment are removed.

    Parameters:
        arg: text containing the placeholders.

    Returns the text with placeholders replaced.  Prints the position of
    the next opening placeholder after each environment (diagnostic).
    """
    enter = "%ENTERMATHMODE%"
    leave = "%LEAVEMATHMODE%"
    for _ in range(20):
        # just take a couple more passes to be sure
        arg = re.sub(
            r"([(),\.0-9]*)%LEAVEMATHMODE%([(),\.0-9]*)%ENTERMATHMODE%([(),\.0-9]*)",
            r"\1\2\3",
            arg,
        )
        arg = re.sub(r"_{([^}]*)}_{([^}]*)}", r"_{\1\2}", arg)
        arg = re.sub(r"\^{([^}]*)}\^{([^}]*)}", r"^{\1\2}", arg)
    nextenter = arg.find(enter)
    # ">= 0": a placeholder at the very start of the text counts too.
    while nextenter >= 0:
        arg = arg.replace(enter, "$", 1)
        while True:
            nextenter = arg.find(enter)
            nextexit = arg.find(leave)
            # "0 <=" matters: find() returns -1 when no opening placeholder
            # is left, and -1 < nextexit used to be taken as "nested", which
            # deleted the last closing placeholder instead of closing it.
            if 0 <= nextenter < nextexit:
                # there is a math mode inside this one, so gobble it up
                arg = arg.replace(enter, "", 1)
                arg = arg.replace(leave, "", 1)
            else:
                # close this math environment
                arg = arg.replace(leave, "$", 1)
                break
        nextenter = arg.find(enter)
        print("next enter is at", nextenter)
    return arg
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
content = decodemathmode(content)
# ``fp`` is the handle for ``newfile`` opened earlier in the script.
try:
    fp.write(content)
finally:
    fp.close()
# Re-read the file just written and re-parse it to extract the plain text.
with open(newfile, "r") as fp:
    content = fp.read()
textfile = re.sub(r"(.*)(\.htm.*)", r"\1.txt", newfile)
doc = html.fromstring(content)
with open(textfile, "w") as fp:
    fp.write(unidecode(doc.text_content()))
textfile = re.sub(r"(.*)(\.htm.*)", r"\1_comments.tex", newfile)
# Writing .encode("utf-8") bytes to a text-mode file raises TypeError on
# Python 3; open the file as UTF-8 text and write the string instead.
with open(textfile, "w", encoding="utf-8") as fp:
    fp.write(decodemathmode(comment_file_text))
|