pyDiffTools 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydifftools/__init__.py +11 -0
- pydifftools/check_numbers.py +70 -0
- pydifftools/command_line.py +747 -0
- pydifftools/command_registry.py +65 -0
- pydifftools/comment_functions.py +39 -0
- pydifftools/continuous.py +194 -0
- pydifftools/copy_files.py +75 -0
- pydifftools/diff-doc.js +193 -0
- pydifftools/doc_contents.py +147 -0
- pydifftools/flowchart/__init__.py +15 -0
- pydifftools/flowchart/dot_to_yaml.py +114 -0
- pydifftools/flowchart/graph.py +620 -0
- pydifftools/flowchart/watch_graph.py +168 -0
- pydifftools/html_comments.py +33 -0
- pydifftools/html_uncomments.py +524 -0
- pydifftools/match_spaces.py +235 -0
- pydifftools/notebook/__init__.py +0 -0
- pydifftools/notebook/fast_build.py +1502 -0
- pydifftools/notebook/tex_to_qmd.py +319 -0
- pydifftools/onewordify.py +149 -0
- pydifftools/onewordify_undo.py +54 -0
- pydifftools/outline.py +173 -0
- pydifftools/rearrange_tex.py +188 -0
- pydifftools/searchacro.py +80 -0
- pydifftools/separate_comments.py +73 -0
- pydifftools/split_conflict.py +213 -0
- pydifftools/unseparate_comments.py +69 -0
- pydifftools/update_check.py +31 -0
- pydifftools/wrap_sentences.py +501 -0
- pydifftools/xml2xlsx.vbs +33 -0
- pydifftools-0.1.8.dist-info/METADATA +146 -0
- pydifftools-0.1.8.dist-info/RECORD +36 -0
- pydifftools-0.1.8.dist-info/WHEEL +5 -0
- pydifftools-0.1.8.dist-info/entry_points.txt +2 -0
- pydifftools-0.1.8.dist-info/licenses/LICENSE.md +28 -0
- pydifftools-0.1.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""this script has been entirely vibe-coded based on the tex example included
|
|
3
|
+
in the repo!"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
7
|
+
import subprocess
|
|
8
|
+
import tempfile
|
|
9
|
+
import shutil
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from pydifftools.command_registry import register_command
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def find_matching(text: str, start: int, open_ch: str, close_ch: str) -> int:
    """Locate the delimiter that balances ``open_ch`` found at *start*.

    Scans forward from ``start + 1`` keeping a nesting depth and returns the
    index of the matching ``close_ch``, or ``-1`` when the text ends before
    the delimiter is closed.
    """
    depth = 1
    for pos in range(start + 1, len(text)):
        ch = text[pos]
        if ch == open_ch:
            depth += 1
        elif ch == close_ch:
            depth -= 1
            if not depth:
                return pos
    return -1
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def preprocess_latex(src: str) -> str:
    """Convert custom environments and observation macros before pandoc.

    Three transformations are applied in order:

    1. ``python`` environments become ``verbatim`` blocks bracketed by
       ``%%PYTHON_START%%``/``%%PYTHON_END%%`` markers.
    2. Remaining ``verbatim`` blocks are bracketed by
       ``%%VERBATIM_START%%``/``%%VERBATIM_END%%`` markers.
    3. ``err`` environments become raw ``<err>...</err>`` HTML, and the
       ``\\o[...]{...}`` / ``\\o{...}`` observation macros become
       ``<obs ...>...</obs>`` tags (parsed by hand so nested braces work).

    The markers are turned back into fenced code blocks later by
    ``finalize_markers``.
    """

    def repl_python(m: re.Match) -> str:
        """Preserve python blocks exactly using markers."""
        code = m.group(1)
        return (
            "\\begin{verbatim}\n%%PYTHON_START%%\n"
            + code
            + "%%PYTHON_END%%\n\\end{verbatim}"
        )

    def repl_verbatim(m: re.Match) -> str:
        """Mark generic verbatim blocks for fenced conversion."""
        newline = m.group(1)
        body = m.group(2)
        # Blocks already produced by repl_python must not be double-wrapped.
        if "%%PYTHON_START%%" in body:
            return m.group(0)
        return (
            f"\\begin{{verbatim}}{newline}%%VERBATIM_START%%\n"
            + body
            + "%%VERBATIM_END%%\n\\end{verbatim}"
        )

    # replace python environment with verbatim + markers without touching
    # the whitespace contained in the block
    src = re.sub(
        r"\\begin{python}(?:\[[^\]]*\])?\n(.*?)\\end{python}",
        repl_python,
        src,
        flags=re.S,
    )

    # mark standard verbatim blocks so they convert to fenced code later
    src = re.sub(
        r"\\begin{verbatim}(\n?)(.*?)\\end{verbatim}",
        repl_verbatim,
        src,
        flags=re.S,
    )

    # convert err environment so pandoc will parse inside while preserving
    # the whitespace exactly
    src = re.sub(
        r"\\begin{err}\n?(.*?)\\end{err}",
        lambda m: f"<err>{m.group(1)}</err>",
        src,
        flags=re.S,
    )

    # handle \o[...]{} and \o{} observations
    out = []
    i = 0
    while True:
        # Find the next observation macro of either form; -1 means no more.
        idx_bracket = src.find("\\o[", i)
        idx_brace = src.find("\\o{", i)
        idxs = [x for x in (idx_bracket, idx_brace) if x != -1]
        idx = min(idxs) if idxs else -1
        if idx == -1:
            out.append(src[i:])
            break
        out.append(src[i:idx])
        j = idx + 2  # position just past "\o"
        attrs = ""
        if j < len(src) and src[j] == "[":
            end_attrs = find_matching(src, j, "[", "]")
            if end_attrs == -1:
                # Unterminated attribute bracket: emit the rest verbatim.
                out.append(src[idx:])
                break
            attrs = src[j + 1 : end_attrs]
            j = end_attrs + 1
        if j >= len(src) or src[j] != "{":
            # "\o" not followed by a body: keep the two characters as-is.
            out.append(src[idx : idx + 2])
            i = idx + 2
            continue
        end_body = find_matching(src, j, "{", "}")
        if end_body == -1:
            # Unterminated body brace: emit the rest verbatim.
            out.append(src[idx:])
            break
        body = src[j + 1 : end_body]
        j = end_body + 1
        if attrs:
            # Attributes look like "TIME (AUTHOR)"; the author part is
            # optional.
            m = re.match(r"(.*?)\s*(\(([^)]+)\))?$", attrs.strip())
            time = m.group(1).strip() if m else attrs.strip()
            author = m.group(3) if m else None
            tag = (
                f'<obs time="{time}"'
                + (f' author="{author}"' if author else "")
                + f">{body}</obs>"
            )
        else:
            tag = f"<obs>{body}</obs>"
        out.append(tag)
        i = j
    return "".join(out)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def clean_html_escapes(text: str) -> str:
    """Strip pandoc's backslash escapes from raw HTML brackets and quotes."""
    unescaped = text
    for escaped, plain in (("\\<", "<"), ("\\>", ">"), ('\\"', '"')):
        unescaped = unescaped.replace(escaped, plain)
    return unescaped
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def finalize_markers(text: str) -> str:
    """Turn the %%...%% marker comments back into fenced code blocks.

    ``%%PYTHON_START%%`` becomes a ```` ```{python} ```` fence (with a
    ``%reset -f`` magic inserted as the first line if the block does not
    already start with one); ``%%VERBATIM_START%%`` becomes a plain fence.
    Pandoc indents verbatim content by four spaces, so those four spaces are
    stripped from lines inside the fences.
    """
    lines = []
    in_py = False       # inside a python fence
    need_reset = False  # next python line must be (or be preceded by) %reset
    in_verb = False     # inside a plain verbatim fence
    for line in text.splitlines(keepends=True):
        if re.match(r"^\s*%%PYTHON_START%%", line):
            lines.append("```{python}\n")
            in_py = True
            need_reset = True
            continue
        if re.match(r"^\s*%%PYTHON_END%%", line):
            lines.append("```\n")
            in_py = False
            continue
        if re.match(r"^\s*%%VERBATIM_START%%", line):
            lines.append("```\n")
            in_verb = True
            continue
        if re.match(r"^\s*%%VERBATIM_END%%", line):
            lines.append("```\n")
            in_verb = False
            continue
        if in_py:
            # Undo pandoc's 4-space verbatim indent.
            stripped = line[4:] if line.startswith("    ") else line
            if need_reset:
                if stripped.lstrip().startswith("%reset"):
                    lines.append(stripped)
                else:
                    lines.append("%reset -f\n")
                    lines.append(stripped)
                need_reset = False
            else:
                lines.append(stripped)
        elif in_verb and line.startswith("    "):
            lines.append(line[4:])
        else:
            lines.append(line)
    return "".join(lines)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def format_observations(text: str) -> str:
    """Collapse each <obs>...</obs> element onto a single line.

    Only the newlines immediately surrounding the body are removed; all
    interior whitespace is preserved verbatim.
    """
    obs_pattern = re.compile(r"(<obs[^>]*>)(.*?)(</obs>)", flags=re.S)

    def join_tag(match: re.Match) -> str:
        opener = match.group(1).strip()
        closer = match.group(3).strip()
        body = match.group(2).strip("\n")
        return opener + body + closer

    return obs_pattern.sub(join_tag, text)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def format_tags(text: str, indent_str: str = "    ") -> str:
    """Format <err> blocks with indentation and tidy <obs> tags.

    The text is split on ``<err>``/``</err>`` tokens; each nesting level of
    an err block is indented by one more copy of ``indent_str``.  Observation
    tags are normalised onto their own lines first via
    ``format_observations``.
    """
    text = format_observations(text)
    # normalize whitespace around err tags
    text = re.sub(r"<err>[ \t]*\n+", "<err>\n", text)
    text = re.sub(r"<err>[ \t]+", "<err>\n", text)
    text = re.sub(r"</err>[ \t]+", "</err>", text)
    # ensure opening obs tags start on a new line without collapsing blank
    # lines
    text = re.sub(r"(\n+)[ \t]*(<obs)", r"\1\2", text)
    text = re.sub(r"(?<!^)(?<!\n)(<obs)", r"\n\1", text)
    # ensure a newline after closing obs tags but keep extra blank lines
    text = re.sub(r"</obs>[ \t]+", "</obs>", text)
    text = re.sub(r"</obs>(?!\n)", "</obs>\n", text)
    pattern = re.compile(r"(<err>|</err>)")
    parts = pattern.split(text)
    out = []
    indent = 0       # current err-nesting depth
    prev_tag = None  # which err tag (if any) immediately preceded this part
    for part in parts:
        if not part:
            continue
        if part == "<err>":
            # Make sure the tag starts on its own line.
            if out and not out[-1].endswith("\n"):
                out[-1] = out[-1].rstrip() + "\n"
            out.append(indent_str * indent + "<err>\n")
            indent += 1
            prev_tag = "<err>"
        elif part == "</err>":
            if out and not out[-1].endswith("\n"):
                out[-1] = out[-1].rstrip() + "\n"
            indent -= 1
            out.append(indent_str * indent + "</err>\n")
            prev_tag = "</err>"
        else:
            # Keep err contents tight while
            # forcing a blank line after
            # closing tags so pandoc treats
            # the debug block as a standalone
            # HTML block. The newline handling
            # below ensures content after a
            # closing </err> tag always starts
            # on its own paragraph line.
            if prev_tag == "<err>" and part.startswith("\n"):
                part = part[1:]
            if prev_tag == "</err>" and not part.startswith("\n"):
                part = "\n" + part
            lines = part.splitlines(True)
            for line in lines:
                # Blank lines are left unindented so trailing whitespace
                # isn't introduced.
                if line.strip():
                    out.append(indent_str * indent + line)
                else:
                    out.append(line)
            prev_tag = None
    formatted = "".join(out)
    # Strip any trailing whitespace that the indenting may have produced.
    return re.sub(r"[ \t]+(?=\n)", "", formatted)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@register_command(
    "Convert LaTeX sources to Quarto Markdown (.qmd) files",
    help={"tex": "Input .tex file to convert"},
)
def tex2qmd(tex):
    """Convert ``tex`` to a .qmd file and return the output path.

    Pipeline: preprocess custom LaTeX environments into marker form, run
    pandoc (preferring Quarto's bundled pandoc so the conversion matches
    Quarto defaults), then post-process the markers back into fenced code
    blocks and tidy <obs>/<err> tags.

    Exits with status 1 when the input file is missing or when neither
    ``quarto`` nor ``pandoc`` is available on PATH.
    """
    import os  # local: only needed to close the mkstemp descriptor

    inp = Path(tex)
    if not inp.exists():
        print(f"File not found: {inp}", file=sys.stderr)
        sys.exit(1)

    base = inp.with_suffix("")
    src = inp.read_text()
    pre_content = preprocess_latex(src)

    # Write the preprocessed LaTeX to a temp file for pandoc to read.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".tex") as pre:
        pre.write(pre_content.encode())
        pre_path = pre.name

    mid_fd, mid_path = tempfile.mkstemp()
    os.close(mid_fd)  # BUGFIX: the descriptor was previously leaked
    Path(mid_path).unlink()  # we just want the name; pandoc will create it

    try:
        # Prefer Quarto's bundled pandoc when available so the conversion
        # matches Quarto defaults, but fall back to a standalone pandoc
        # installation when Quarto is not on PATH.
        quarto = shutil.which("quarto")
        pandoc = shutil.which("pandoc")
        if quarto:
            cmd = [quarto, "pandoc"]
        elif pandoc:
            cmd = [pandoc]
        else:
            # BUGFIX: previously fell through with cmd = [None], crashing
            # inside subprocess with an opaque TypeError.
            print(
                "Neither quarto nor pandoc was found on PATH",
                file=sys.stderr,
            )
            sys.exit(1)
        cmd += [
            pre_path,
            "-f",
            "latex",
            "-t",
            "markdown",
            "--wrap=none",
            "-o",
            mid_path,
        ]
        subprocess.run(cmd, check=True)
    finally:
        Path(pre_path).unlink(missing_ok=True)

    mid_text = Path(mid_path).read_text()
    Path(mid_path).unlink(missing_ok=True)

    clean_text = clean_html_escapes(mid_text)
    final_text = finalize_markers(clean_text)
    formatted = format_tags(final_text)
    out_path = base.with_suffix(".qmd")
    out_path.write_text(formatted)
    print(f"Wrote {out_path}")
    return out_path
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def main():
    """CLI entry point: require exactly one .tex argument and convert it."""
    args = sys.argv[1:]
    if len(args) != 1:
        print("Usage: tex_to_qmd.py file.tex", file=sys.stderr)
        sys.exit(1)
    tex2qmd(args[0])
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# Allow direct execution as a script in addition to the registered command.
if __name__ == "__main__":
    main()


# Maintain the previous helper name for any existing imports.
convert_tex_to_qmd = tex2qmd
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# again rerun
#
# One-word-per-line splitter: breaks a document (LaTeX or plain text) so
# that each word/token lands on its own line, protecting LaTeX commands,
# citations, and references with %SPACE%/%NEWLINE%/%NONEWLINE% markers so
# the transformation can be undone later.  Writes FILE_1word.EXT and, for
# LaTeX input, FILE_1wordstripped.EXT as well.
from lxml import html, etree
import os
from matlablike import *
from unidecode import unidecode
import re
import sys

fp = open(sys.argv[1], "r")
paragraphcommands_re = re.compile(r"^ *\\(sub)*paragraph{.*}")
commentline_re = re.compile(r"^ *%")
beginlatex_re = re.compile(r"^[^#]*\\begin{document}(.*)")
endlatex_re = re.compile(r"^([^#]*)\\end{document}.*")
commandstart_re = re.compile(r"(\\[a-zA-Z]+[\[{])")
word_citation_re = re.compile(r"(\[[0-9 ,]+\][,\.)]*)")
tex_citation_re = re.compile(r"(.*)(\\cite{[a-zA-Z0-9,_]+}[,\.)]*)(.*)$")
tex_ref_re = re.compile(r"(.*)(\\c*ref{[a-zA-Z0-9,_:\-]+}[,\.)]*)(.*)$")
text_list = []
if sys.argv[1][-4:] == ".tex":
    latex_file = True
else:
    latex_file = False
found_beginning = False
start_line = 0
end_line = 0
print("opened", sys.argv[1])
# {{{ pull out just the part between the document text
j = 0
for thisline in fp:
    thisline = thisline.replace("\xa0", " ")  # because word sucks
    thisline = thisline.replace(
        "\x93", "``"
    )  # this and following are just pulled from vim
    thisline = thisline.replace("\x94", "''")
    thisline = thisline.replace("\x96", "--")
    j += 1
    if latex_file:
        if not found_beginning:
            thismatch = beginlatex_re.match(thisline)
            if thismatch:
                text_list.append(thismatch.groups()[0].rstrip())
                found_beginning = True
                start_line = j + 1
                print("Found the beginning at line", start_line)
        else:
            thismatch = endlatex_re.match(thisline)
            if thismatch:
                text_list.append(thismatch.groups()[0].rstrip())
                print("Found the end")
                end_line = j
                print("Found the end at line", end_line)
            # NOTE(review): in the recovered source the indentation of this
            # append is ambiguous; it is placed so every line after
            # \begin{document} is collected — confirm against the original.
            text_list.append(thisline)
    else:
        text_list.append(
            thisline.replace("$$", "")
        )  # no better place to check for the tex dollar sign double-up
if end_line == 0:
    end_line = len(text_list)
fp.close()
j = 0
while j < len(
    text_list
):  # first, put citations on their own line, so I can next treat them as special lines
    thismatch = tex_citation_re.match(text_list[j])
    othermatch = tex_ref_re.match(text_list[j])
    if othermatch:
        thismatch = othermatch
    if thismatch:
        text_list.pop(j)
        text_list.insert(
            j, thismatch.groups()[2]
        )  # push on backwards, so it shows up in the right order
        text_list.insert(
            j,
            thismatch.groups()[1].replace(" ", "\n%SPACE%\n")
            + "%NONEWLINE%\n",
        )  # since these are "fake" newlines, make sure they don't get broken! -- also to preserve spaces, I'm pre-processing the spacing here
        text_list.insert(
            j,
            thismatch.groups()[0].replace(" ", "\n%SPACE%\n")
            + "%NONEWLINE%\n",
        )
        print(
            "found citation or reference, broke line:",
            text_list[j],
            text_list[j + 1],
            text_list[j + 2],
        )
        print("---")
        j += 1  # so that we skip the citation we just added
        end_line += 2  # because we added two lines
    j += 1
for j in range(0, len(text_list)):
    thismatch = paragraphcommands_re.match(text_list[j])
    if thismatch:
        text_list[j] = text_list[j].replace(
            "\n", "%NEWLINE%\n"
        )  # these lines are protected/preserved from being chopped up, since they are invisible
        print("found paragraph line:", text_list[j])
    else:
        thismatch = tex_citation_re.match(text_list[j])
        if not thismatch:
            thismatch = tex_ref_re.match(text_list[j])
        if thismatch:
            print("found citation line:", text_list[j])
        else:
            # Split ordinary text: each command start, brace, citation
            # bracket, and space gets its own line.
            text_list[j] = text_list[j].replace("~", "\n~\n")
            text_list[j] = commandstart_re.sub("\\1\n", text_list[j])
            text_list[j] = word_citation_re.sub("\n\\1\n", text_list[j])
            text_list[j] = text_list[j].replace("}", "\n}\n")
            text_list[j] = text_list[j].replace("]{", "\n]{\n")
            text_list[j] = text_list[j].replace(" ", "\n%SPACE%\n")
        # NOTE(review): placement of the following marker handling relative
        # to the citation branch is inferred from the markers' semantics —
        # confirm against the original file.
        if text_list[j][-12:] == "%NONEWLINE%\n":
            print("trying to drop NONEWLINE going from:")
            print(text_list[j])
            text_list[j] = text_list[j][:-12] + "\n"
            print("to:\n", text_list[j])
        else:
            print("line ends in:", text_list[j][-12:])
            text_list[j] += "%NEWLINE%\n"
        text_list[j] = text_list[j].replace("\r", "\n%NEWLINE%\n")
# }}}
# {{{ write out the result
outputtext = "".join(text_list)
outputtext = outputtext.split("\n")
outputtext = [j for j in outputtext if len(j) > 0]
if not latex_file:  # easier to just strip the tags here
    print("this is not a latex file")
    outputtext = [j for j in outputtext if j != "%SPACE%" and j != "%NEWLINE%"]
else:
    print("this is a latex file")
    outputtex = "".join(
        text_list[start_line:end_line]
    )  # up to but not including the end document
    outputtex = outputtex.split("\n")
    outputtex = [j for j in outputtex if len(j) > 0]
    outputtex = [
        j for j in outputtex if j[0] != "%"
    ]  # takes care of space and newline as well as tex comments
newfile = re.sub(r"(.*)(\..*)", r"\1_1word\2", sys.argv[1])
fp = open(newfile, "w")
fp.write("\n".join(outputtext))
fp.close()
if latex_file:
    newfile = re.sub(r"(.*)(\..*)", r"\1_1wordstripped\2", sys.argv[1])
    fp = open(newfile, "w")
    fp.write("\n".join(outputtex))
    fp.close()
# }}}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# again rerun
#
# Undo for the one-word-per-line transform: rejoins a *_1word file back
# into flowing text.  Merge-conflict markers are kept on their own lines,
# special lines (paragraph commands, comments) keep their spacing via
# %SPACE% markers, and needsspace_re reinserts the spaces between words
# before all remaining plain spaces are collapsed.  Writes
# FILE_1wordcollapse.EXT.
from lxml import html, etree
import os
from matlablike import *
from unidecode import unidecode
import re
import sys

fp = open(sys.argv[1], "r")
needsspace_re = re.compile(r'(\w[):;"\-\.,!?}]*) +(["(]*\w)')
paragraphcommands_re = re.compile(r"^ *\\(sub)*paragraph{.*}")
commentline_re = re.compile(r"^ *%")
normalline_re = re.compile(r"^\(%SPACE%\)\|\(%NEWLINE%\)")
notweird_re = re.compile(r"^(%SPACE%)|(%\[ORIG%)|(%ORIG\]\[NEW%)|(%NEW\]%)")
text_list = []
found_beginning = False
print("opened", sys.argv[1])
# {{{ pull out just the part between the document text
for thisline in fp:
    # Preserve git merge-conflict markers on their own lines.
    if (
        (thisline[:7] == "<<<<<<<")
        or (thisline[:7] == "=======")
        or (thisline[:7] == ">>>>>>>")
    ):
        text_list.append(
            "%NEWLINE% %CONFLICT%" + thisline.strip() + "%NEWLINE%"
        )
    else:
        text_list.append(thisline.rstrip())
fp.close()
text_list = [x.replace("%NEWLINE%", "\n") for x in text_list]
# {{{ don't mess with the "special" lines
for j, thisline in enumerate(text_list):
    if not notweird_re.match(thisline):
        if paragraphcommands_re.match(thisline) or commentline_re.match(
            thisline
        ):
            print("found special line '", thisline, "'")
            # Protect existing spaces so the global collapse below keeps
            # them.
            text_list[j] = thisline.replace(" ", " %SPACE% ")
# }}}
text_list = " ".join(text_list)
text_list = needsspace_re.sub(r"\1 %SPACE% \2", text_list)
text_list = needsspace_re.sub(
    r"\1 %SPACE% \2", text_list
)  # again to catch the single letter ones
# Drop the line-joining spaces, then restore the protected ones.
text_list = text_list.replace(" ", "")
text_list = text_list.replace("%SPACE%", " ")
# {{{ write out the result
newfile = re.sub(r"(.*)(\..*)", r"\1_1wordcollapse\2", sys.argv[1])
fp = open(newfile, "w")
outputtext = "".join(text_list)
fp.write(outputtext)
fp.close()
# }}}
|
pydifftools/outline.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
from .doc_contents import doc_contents_class
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from .command_registry import register_command
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _write_outline_files(all_contents, basename):
    """Persist the parsed document: a pickle sidecar holding the full
    content plus a user-editable ``*_outline.md`` listing of headings."""
    pickle_path = f"{basename}_outline.pickle"
    outline_path = f"{basename}_outline.md"
    with open(pickle_path, "wb") as sidecar:
        pickle.dump(all_contents, sidecar)
    with open(outline_path, "w", encoding="utf-8") as outline_fp:
        outline_fp.write(all_contents.outline)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def extract_outline(filename):
    """Split *filename* (a .tex file) into sections and write sidecar files.

    Each ``\\section``-family command starts a new section in the
    ``doc_contents_class`` accumulator; everything else is appended as body
    text.  Section titles whose closing brace falls on a later line are
    handled by carrying the brace-nesting state across lines.

    Writes ``<basename>_outline.pickle`` and ``<basename>_outline.md`` via
    ``_write_outline_files``.
    """
    basename = filename.replace(".tex", "")
    section_re = re.compile(
        r"\\(paragraph|subparagraph|subsubsection|subsection|section)\{"
    )

    all_contents = doc_contents_class()
    bracelevel = 0
    escaped = False
    thistitle = ""
    sectype = None
    with open(filename, "r", encoding="utf-8") as fp:
        for thisline in fp:
            # Where the title text starts on this line: right after the
            # sectioning command when it opens here, or at column 0 when we
            # are continuing a title begun on a previous line.
            # BUGFIX: the original sliced continuation lines at the stale
            # match offset from the *previous* line, corrupting multi-line
            # titles.
            scan_start = 0
            if bracelevel == 0:
                thismatch = section_re.match(thisline)
                if thismatch:
                    sectype = thismatch.groups()[0]
                    bracelevel = 1
                    all_contents += thisline[: thismatch.start()]
                    escaped = False
                    thistitle = ""
                    scan_start = thismatch.end()
                else:
                    all_contents += thisline
                    continue
            # Scan for the brace that closes the section title, honouring
            # backslash escapes and nested braces.
            for n, j in enumerate(thisline[scan_start:]):
                if escaped:
                    escaped = False
                elif j == "\\":
                    escaped = True
                elif j == "{":
                    bracelevel += 1
                elif j == "}":
                    bracelevel -= 1
                if bracelevel > 0:
                    thistitle += j
                else:
                    all_contents.start_sec(sectype, thistitle)
                    all_contents += thisline[scan_start + n + 1 :]
                    break
            # If the loop ends without the break, the title continues on the
            # next line; the newline it carries is already part of thistitle.
    _write_outline_files(all_contents, basename)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _reorder_from_outline(targetfile, extension, format_type):
    """Rebuild *targetfile* from its pickle sidecar, emitting sections in
    the order found in the user-edited ``*_outline.md`` file."""
    sidecar_md = targetfile.replace(extension, "_outline.md")
    sidecar_pickle = targetfile.replace(extension, "_outline.pickle")
    names_ok = (
        sidecar_md.endswith(".md")
        and sidecar_pickle.endswith(".pickle")
        and targetfile.endswith(extension)
    )
    if not names_ok:
        raise ValueError("pass 1 argument: target file (output)")

    with open(sidecar_pickle, "rb") as sidecar:
        all_contents = pickle.load(sidecar)
    all_contents.set_format(format_type)
    with open(sidecar_md, "r", encoding="utf-8") as outline_fp:
        for outline_line in outline_fp:
            all_contents.outline_in_order(outline_line.rstrip())
    with open(targetfile, "w", encoding="utf-8", newline="\n") as out_fp:
        out_fp.write(str(all_contents))
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@register_command(
    "Save tex file as outline, with filename_outline.pickle storing content",
    " and filename_outline.md giving outline.",
)
def xo(arguments):
    """Extract the outline of the single .tex file named in *arguments*."""
    # BUGFIX: validate explicitly rather than with `assert`, which is
    # silently stripped when Python runs with -O.
    if len(arguments) != 1:
        raise ValueError("xo expects exactly one argument: the .tex filename")
    extract_outline(arguments[0])
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@register_command(
    "Save markdown file as outline, with filename_outline.pickle storing"
    " content",
    " and filename_outline.md giving outline.",
)
def xomd(arguments):
    """Extract the outline of a Markdown file named in ``arguments[0]``.

    Recognises ATX headings (``#`` .. ``######``) and setext headings
    (``===`` / ``---`` underlines); fenced code blocks are passed through
    untouched.  Writes the outline sidecar files via
    ``_write_outline_files``.
    """
    assert len(arguments) == 1
    filename = arguments[0]
    # read a markdown file and capture headings while keeping content for
    # reordering
    basename = filename.replace(".md", "")
    header_re = re.compile(r"^(#{1,6})\s+(.*)")
    underline_re = {
        "section": re.compile(r"^={3,}\s*$"),
        "subsection": re.compile(r"^-{3,}\s*$"),
    }

    all_contents = doc_contents_class("markdown")
    # previous_line buffers the last content line so a following setext
    # underline can turn it into a heading instead of body text.
    previous_line = None
    in_code_block = False
    with open(filename, "r", encoding="utf-8") as fp:
        for thisline in fp:
            stripped = thisline.rstrip("\n")
            if stripped.startswith("```"):
                in_code_block = not in_code_block
                all_contents += thisline
                previous_line = None
                continue
            if in_code_block:
                all_contents += thisline
                continue
            thismatch = header_re.match(stripped)
            if thismatch:
                # ATX heading: depth is the number of leading '#'.
                hashes, thistitle = thismatch.groups()
                level = len(hashes)
                if level == 1:
                    all_contents.start_sec("section", thistitle.strip())
                elif level == 2:
                    all_contents.start_sec("subsection", thistitle.strip())
                elif level == 3:
                    all_contents.start_sec("subsubsection", thistitle.strip())
                elif level == 4:
                    all_contents.start_sec("paragraph", thistitle.strip())
                else:
                    all_contents.start_sec("subparagraph", thistitle.strip())
                all_contents += "\n"
                previous_line = None
                continue
            if previous_line is not None:
                # Setext headings: the buffered line becomes the title.
                if underline_re["section"].match(stripped):
                    all_contents.start_sec("section", previous_line.strip())
                    previous_line = None
                    continue
                if underline_re["subsection"].match(stripped):
                    all_contents.start_sec("subsection", previous_line.strip())
                    previous_line = None
                    continue
                all_contents += previous_line + "\n"
            previous_line = stripped
    # Flush the final buffered line (if it was not a heading title).
    if previous_line:
        all_contents += previous_line + "\n"
    _write_outline_files(all_contents, basename)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@register_command(
    "use the modified filename_outline.md to write reordered text",
    help={"texfile": "TeX file to regenerate from its outline files"},
)
def xoreorder(texfile):
    """Rewrite a TeX file using its saved outline and ordering hints."""

    # Delegates to the shared reorder helper with LaTeX-specific settings.
    _reorder_from_outline(texfile, ".tex", "latex")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
@register_command(
    "rewrite a markdown file using its saved outline and ordering hints",
    help={"mdfile": "Markdown file to regenerate from its outline files"},
)
def xomdreorder(mdfile):
    """Rewrite a Markdown file using its saved outline and ordering hints."""
    # Delegates to the shared reorder helper with Markdown-specific settings.
    _reorder_from_outline(mdfile, ".md", "markdown")


# Provide the previous function name for callers expecting it.
write_reordered = xoreorder
|