novelWriter 2.4.4__py3-none-any.whl → 2.5rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {novelWriter-2.4.4.dist-info → novelWriter-2.5rc1.dist-info}/METADATA +4 -5
- {novelWriter-2.4.4.dist-info → novelWriter-2.5rc1.dist-info}/RECORD +109 -101
- {novelWriter-2.4.4.dist-info → novelWriter-2.5rc1.dist-info}/WHEEL +1 -1
- novelwriter/__init__.py +33 -39
- novelwriter/assets/i18n/project_en_GB.json +1 -0
- novelwriter/assets/icons/typicons_dark/icons.conf +2 -0
- novelwriter/assets/icons/typicons_dark/nw_font.svg +4 -0
- novelwriter/assets/icons/typicons_dark/nw_quote.svg +4 -0
- novelwriter/assets/icons/typicons_light/icons.conf +2 -0
- novelwriter/assets/icons/typicons_light/nw_font.svg +4 -0
- novelwriter/assets/icons/typicons_light/nw_quote.svg +4 -0
- novelwriter/assets/manual.pdf +0 -0
- novelwriter/assets/sample.zip +0 -0
- novelwriter/assets/syntax/cyberpunk_night.conf +5 -3
- novelwriter/assets/syntax/default_dark.conf +32 -18
- novelwriter/assets/syntax/default_light.conf +24 -10
- novelwriter/assets/syntax/dracula.conf +44 -0
- novelwriter/assets/syntax/grey_dark.conf +5 -4
- novelwriter/assets/syntax/grey_light.conf +5 -4
- novelwriter/assets/syntax/light_owl.conf +7 -6
- novelwriter/assets/syntax/night_owl.conf +7 -6
- novelwriter/assets/syntax/snazzy.conf +42 -0
- novelwriter/assets/syntax/solarized_dark.conf +4 -3
- novelwriter/assets/syntax/solarized_light.conf +4 -3
- novelwriter/assets/syntax/tango.conf +27 -11
- novelwriter/assets/syntax/tomorrow.conf +6 -5
- novelwriter/assets/syntax/tomorrow_night.conf +7 -6
- novelwriter/assets/syntax/tomorrow_night_blue.conf +6 -5
- novelwriter/assets/syntax/tomorrow_night_bright.conf +6 -5
- novelwriter/assets/syntax/tomorrow_night_eighties.conf +6 -5
- novelwriter/assets/text/credits_en.htm +4 -1
- novelwriter/assets/themes/cyberpunk_night.conf +3 -0
- novelwriter/assets/themes/default_dark.conf +2 -0
- novelwriter/assets/themes/default_light.conf +2 -0
- novelwriter/assets/themes/dracula.conf +48 -0
- novelwriter/assets/themes/solarized_dark.conf +2 -0
- novelwriter/assets/themes/solarized_light.conf +2 -0
- novelwriter/common.py +33 -12
- novelwriter/config.py +184 -98
- novelwriter/constants.py +47 -35
- novelwriter/core/buildsettings.py +68 -69
- novelwriter/core/coretools.py +5 -23
- novelwriter/core/docbuild.py +52 -40
- novelwriter/core/document.py +3 -5
- novelwriter/core/index.py +115 -45
- novelwriter/core/item.py +8 -19
- novelwriter/core/options.py +2 -4
- novelwriter/core/project.py +23 -57
- novelwriter/core/projectdata.py +1 -3
- novelwriter/core/projectxml.py +12 -15
- novelwriter/core/sessions.py +3 -5
- novelwriter/core/spellcheck.py +4 -9
- novelwriter/core/status.py +211 -164
- novelwriter/core/storage.py +0 -8
- novelwriter/core/tohtml.py +139 -105
- novelwriter/core/tokenizer.py +278 -122
- novelwriter/core/{tomd.py → tomarkdown.py} +97 -78
- novelwriter/core/toodt.py +257 -166
- novelwriter/core/toqdoc.py +419 -0
- novelwriter/core/tree.py +5 -7
- novelwriter/dialogs/about.py +11 -18
- novelwriter/dialogs/docmerge.py +17 -19
- novelwriter/dialogs/docsplit.py +17 -19
- novelwriter/dialogs/editlabel.py +6 -10
- novelwriter/dialogs/preferences.py +193 -144
- novelwriter/dialogs/projectsettings.py +225 -189
- novelwriter/dialogs/quotes.py +12 -9
- novelwriter/dialogs/wordlist.py +9 -15
- novelwriter/enum.py +35 -30
- novelwriter/error.py +8 -15
- novelwriter/extensions/configlayout.py +40 -21
- novelwriter/extensions/eventfilters.py +1 -5
- novelwriter/extensions/modified.py +58 -14
- novelwriter/extensions/novelselector.py +1 -3
- novelwriter/extensions/pagedsidebar.py +9 -12
- novelwriter/extensions/{circularprogress.py → progressbars.py} +30 -8
- novelwriter/extensions/statusled.py +29 -25
- novelwriter/extensions/switch.py +4 -6
- novelwriter/extensions/switchbox.py +7 -6
- novelwriter/extensions/versioninfo.py +3 -9
- novelwriter/gui/doceditor.py +118 -137
- novelwriter/gui/dochighlight.py +231 -186
- novelwriter/gui/docviewer.py +66 -107
- novelwriter/gui/docviewerpanel.py +3 -10
- novelwriter/gui/editordocument.py +1 -3
- novelwriter/gui/itemdetails.py +7 -11
- novelwriter/gui/mainmenu.py +22 -18
- novelwriter/gui/noveltree.py +11 -24
- novelwriter/gui/outline.py +14 -26
- novelwriter/gui/projtree.py +35 -60
- novelwriter/gui/search.py +10 -3
- novelwriter/gui/sidebar.py +2 -6
- novelwriter/gui/statusbar.py +29 -37
- novelwriter/gui/theme.py +26 -48
- novelwriter/guimain.py +134 -148
- novelwriter/shared.py +36 -32
- novelwriter/text/patterns.py +113 -0
- novelwriter/tools/dictionaries.py +10 -20
- novelwriter/tools/lipsum.py +10 -16
- novelwriter/tools/manusbuild.py +9 -11
- novelwriter/tools/manuscript.py +71 -145
- novelwriter/tools/manussettings.py +71 -75
- novelwriter/tools/noveldetails.py +16 -21
- novelwriter/tools/welcome.py +12 -26
- novelwriter/tools/writingstats.py +9 -12
- novelwriter/types.py +49 -4
- novelwriter/extensions/simpleprogress.py +0 -55
- {novelWriter-2.4.4.dist-info → novelWriter-2.5rc1.dist-info}/LICENSE.md +0 -0
- {novelWriter-2.4.4.dist-info → novelWriter-2.5rc1.dist-info}/entry_points.txt +0 -0
- {novelWriter-2.4.4.dist-info → novelWriter-2.5rc1.dist-info}/top_level.txt +0 -0
novelwriter/core/tokenizer.py
CHANGED
@@ -24,30 +24,35 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 from __future__ import annotations

-import re
 import json
 import logging
+import re

 from abc import ABC, abstractmethod
-from time import time
-from pathlib import Path
 from functools import partial
+from pathlib import Path
+from time import time

 from PyQt5.QtCore import QCoreApplication, QRegularExpression
+from PyQt5.QtGui import QFont

-from novelwriter
-from novelwriter.
-
-)
+from novelwriter import CONFIG
+from novelwriter.common import checkInt, formatTimeStamp, numberToRoman
+from novelwriter.constants import nwHeadFmt, nwKeyWords, nwLabels, nwShortcode, nwUnicode, trConst
 from novelwriter.core.index import processComment
 from novelwriter.core.project import NWProject
 from novelwriter.enum import nwComment, nwItemLayout
+from novelwriter.text.patterns import REGEX_PATTERNS

 logger = logging.getLogger(__name__)

 ESCAPES = {r"\*": "*", r"\~": "~", r"\_": "_", r"\[": "[", r"\]": "]", r"\ ": ""}
 RX_ESC = re.compile("|".join([re.escape(k) for k in ESCAPES.keys()]), flags=re.DOTALL)

+T_Formats = list[tuple[int, int, str]]
+T_Comment = tuple[str, T_Formats]
+T_Token = tuple[int, int, str, T_Formats, int]
+

 def stripEscape(text: str) -> str:
     """Strip escaped Markdown characters from paragraph text."""
@@ -80,6 +85,12 @@ class Tokenizer(ABC):
     FMT_SUP_E = 12  # End superscript
     FMT_SUB_B = 13  # Begin subscript
     FMT_SUB_E = 14  # End subscript
+    FMT_DL_B = 15   # Begin dialogue
+    FMT_DL_E = 16   # End dialogue
+    FMT_ADL_B = 17  # Begin alt dialogue
+    FMT_ADL_E = 18  # End alt dialogue
+    FMT_FNOTE = 19  # Footnote marker
+    FMT_STRIP = 20  # Strip the format code

     # Block Type
     T_EMPTY = 1  # Empty line (new paragraph)
@@ -108,48 +119,60 @@ class Tokenizer(ABC):
     A_Z_BTMMRG = 0x0080  # Zero bottom margin
     A_IND_L    = 0x0100  # Left indentation
     A_IND_R    = 0x0200  # Right indentation
+    A_IND_T    = 0x0400  # Text indentation
+
+    # Masks
+    M_ALIGNED = A_LEFT | A_RIGHT | A_CENTRE | A_JUSTIFY

     # Lookups
     L_HEADINGS = [T_TITLE, T_HEAD1, T_HEAD2, T_HEAD3, T_HEAD4]
+    L_SKIP_INDENT = [T_TITLE, T_HEAD1, T_HEAD2, T_HEAD2, T_HEAD3, T_HEAD4, T_SEP, T_SKIP]
+    L_SUMMARY = [T_SYNOPSIS, T_SHORT]

     def __init__(self, project: NWProject) -> None:

         self._project = project

         # Data Variables
-        self._text = ""
-        self._handle = None
-        self._result = ""
+        self._text = ""       # The raw text to be tokenized
+        self._handle = None   # The item handle currently being processed
+        self._result = ""     # The result of the last document
+        self._keepMD = False  # Whether to keep the markdown text

-
-        self.
+        # Tokens and Meta Data (Per Document)
+        self._tokens: list[T_Token] = []
+        self._footnotes: dict[str, T_Comment] = {}

-        #
-        self._tokens: list[tuple[int, int, str, list[tuple[int, int]], int]] = []
+        # Tokens and Meta Data (Per Instance)
         self._counts: dict[str, int] = {}
         self._outline: dict[str, str] = {}
+        self._markdown: list[str] = []

         # User Settings
-        self._textFont = "Serif"  # Output text font
-        self._textSize = 11  # Output text size
-        self._textFixed = False  # Fixed width text
+        self._textFont = QFont("Serif", 11)  # Output text font
         self._lineHeight = 1.15   # Line height in units of em
         self._blockIndent = 4.00  # Block indent in units of em
+        self._firstIndent = False  # Enable first line indent
+        self._firstWidth = 1.40    # First line indent in units of em
+        self._indentFirst = False  # Indent first paragraph
         self._doJustify = False   # Justify text
         self._doBodyText = True   # Include body text
         self._doSynopsis = False  # Also process synopsis comments
         self._doComments = False  # Also process comments
         self._doKeywords = False  # Also process keywords like tags and references
         self._skipKeywords = set()  # Keywords to ignore
+        self._keepBreaks = True   # Keep line breaks in paragraphs

         # Margins
-        self._marginTitle = (1.
-        self._marginHead1 = (1.
-        self._marginHead2 = (
-        self._marginHead3 = (
-        self._marginHead4 = (
+        self._marginTitle = (1.417, 0.500)
+        self._marginHead1 = (1.417, 0.500)
+        self._marginHead2 = (1.668, 0.500)
+        self._marginHead3 = (1.168, 0.500)
+        self._marginHead4 = (1.168, 0.500)
         self._marginText = (0.000, 0.584)
         self._marginMeta = (0.000, 0.584)
+        self._marginFoot = (1.417, 0.467)
+        self._marginSep = (1.168, 1.168)

         # Title Formats
         self._fmtTitle = nwHeadFmt.TITLE  # Formatting for titles
@@ -174,7 +197,8 @@ class Tokenizer(ABC):

         # Instance Variables
         self._hFormatter = HeadingFormatter(self._project)
-        self._noSep = True
+        self._noSep = True        # Flag to indicate that we don't want a scene separator
+        self._showDialog = False  # Flag for dialogue highlighting

         # This File
         self._isNovel = False  # Document is a novel document
@@ -189,12 +213,12 @@ class Tokenizer(ABC):

         # Format RegEx
         self._rxMarkdown = [
-            (
-            (
-            (
+            (REGEX_PATTERNS.markdownItalic, [0, self.FMT_I_B, 0, self.FMT_I_E]),
+            (REGEX_PATTERNS.markdownBold, [0, self.FMT_B_B, 0, self.FMT_B_E]),
+            (REGEX_PATTERNS.markdownStrike, [0, self.FMT_D_B, 0, self.FMT_D_E]),
         ]
-        self._rxShortCodes =
-        self._rxShortCodeVals =
+        self._rxShortCodes = REGEX_PATTERNS.shortcodePlain
+        self._rxShortCodeVals = REGEX_PATTERNS.shortcodeValue

         self._shortCodeFmt = {
             nwShortcode.ITALIC_O: self.FMT_I_B, nwShortcode.ITALIC_C: self.FMT_I_E,
@@ -205,6 +229,11 @@ class Tokenizer(ABC):
             nwShortcode.SUP_O: self.FMT_SUP_B, nwShortcode.SUP_C: self.FMT_SUP_E,
             nwShortcode.SUB_O: self.FMT_SUB_B, nwShortcode.SUB_C: self.FMT_SUB_E,
         }
+        self._shortCodeVals = {
+            nwShortcode.FOOTNOTE_B: self.FMT_FNOTE,
+        }
+
+        self._rxDialogue: list[tuple[QRegularExpression, int, int]] = []

         return

@@ -220,7 +249,7 @@ class Tokenizer(ABC):
     @property
     def allMarkdown(self) -> list[str]:
         """The combined novelWriter Markdown text."""
-        return self.
+        return self._markdown

     @property
     def textStats(self) -> dict[str, int]:
@@ -298,11 +327,9 @@ class Tokenizer(ABC):
         )
         return

-    def setFont(self,
+    def setFont(self, font: QFont) -> None:
         """Set the build font."""
-        self._textFont =
-        self._textSize = round(int(size))
-        self._textFixed = isFixed
+        self._textFont = font
         return

     def setLineHeight(self, height: float) -> None:
@@ -315,11 +342,43 @@ class Tokenizer(ABC):
         self._blockIndent = min(max(float(indent), 0.0), 10.0)
         return

+    def setFirstLineIndent(self, state: bool, indent: float, first: bool) -> None:
+        """Set first line indent and whether to also indent first
+        paragraph after a heading.
+        """
+        self._firstIndent = state
+        self._firstWidth = indent
+        self._indentFirst = first
+        return
+
     def setJustify(self, state: bool) -> None:
         """Enable or disable text justification."""
         self._doJustify = state
         return

+    def setDialogueHighlight(self, state: bool) -> None:
+        """Enable or disable dialogue highlighting."""
+        self._rxDialogue = []
+        self._showDialog = state
+        if state:
+            if CONFIG.dialogStyle > 0:
+                self._rxDialogue.append((
+                    REGEX_PATTERNS.dialogStyle, self.FMT_DL_B, self.FMT_DL_E
+                ))
+            if CONFIG.dialogLine:
+                self._rxDialogue.append((
+                    REGEX_PATTERNS.dialogLine, self.FMT_DL_B, self.FMT_DL_E
+                ))
+            if CONFIG.narratorBreak:
+                self._rxDialogue.append((
+                    REGEX_PATTERNS.narratorBreak, self.FMT_DL_E, self.FMT_DL_B
+                ))
+            if CONFIG.altDialogOpen and CONFIG.altDialogClose:
+                self._rxDialogue.append((
+                    REGEX_PATTERNS.altDialogStyle, self.FMT_ADL_B, self.FMT_ADL_E
+                ))
+        return
+
     def setTitleMargins(self, upper: float, lower: float) -> None:
         """Set the upper and lower title margin."""
         self._marginTitle = (float(upper), float(lower))
@@ -355,6 +414,11 @@ class Tokenizer(ABC):
         self._marginMeta = (float(upper), float(lower))
         return

+    def setSeparatorMargins(self, upper: float, lower: float) -> None:
+        """Set the upper and lower meta text margin."""
+        self._marginSep = (float(upper), float(lower))
+        return
+
     def setLinkHeadings(self, state: bool) -> None:
         """Enable or disable adding an anchor before headings."""
         self._linkHeadings = state
@@ -385,9 +449,14 @@ class Tokenizer(ABC):
         self._skipKeywords = set(x.lower().strip() for x in keywords.split(","))
         return

+    def setKeepLineBreaks(self, state: bool) -> None:
+        """Keep line breaks in paragraphs."""
+        self._keepBreaks = state
+        return
+
     def setKeepMarkdown(self, state: bool) -> None:
         """Keep original markdown during build."""
-        self.
+        self._keepMD = state
         return

     ##
@@ -417,8 +486,8 @@ class Tokenizer(ABC):
         self._tokens.append((
             self.T_TITLE, 1, title, [], textAlign
         ))
-        if self.
-            self.
+        if self._keepMD:
+            self._markdown.append(f"#! {title}\n\n")

         return

@@ -446,7 +515,7 @@ class Tokenizer(ABC):
         self._text = xRep.sub(lambda x: repDict[x.group(0)], self._text)

         # Process the character translation map
-        trDict = {nwUnicode.
+        trDict = {nwUnicode.U_MAPOS: nwUnicode.U_RSQUO}
         self._text = self._text.translate(str.maketrans(trDict))

         return
@@ -466,22 +535,23 @@ class Tokenizer(ABC):
         4: The internal formatting map of the text, self.FMT_*
         5: The style of the block, self.A_*
         """
-        self._tokens = []
         if self._isNovel:
             self._hFormatter.setHandle(self._handle)

         nHead = 0
         breakNext = False
         tmpMarkdown = []
+        tHandle = self._handle or ""
+        tokens: list[T_Token] = []
         for aLine in self._text.splitlines():
             sLine = aLine.strip().lower()

             # Check for blank lines
             if len(sLine) == 0:
-
+                tokens.append((
                     self.T_EMPTY, nHead, "", [], self.A_NONE
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append("\n")

                 continue
@@ -507,7 +577,7 @@ class Tokenizer(ABC):
                 continue

             elif sLine == "[vspace]":
-
+                tokens.append(
                     (self.T_SKIP, nHead, "", [], sAlign)
                 )
                 continue
@@ -515,11 +585,11 @@ class Tokenizer(ABC):
             elif sLine.startswith("[vspace:") and sLine.endswith("]"):
                 nSkip = checkInt(sLine[8:-1], 0)
                 if nSkip >= 1:
-
+                    tokens.append(
                         (self.T_SKIP, nHead, "", [], sAlign)
                     )
                 if nSkip > 1:
-
+                    tokens += (nSkip - 1) * [
                         (self.T_SKIP, nHead, "", [], self.A_NONE)
                     ]
                 continue
@@ -533,24 +603,32 @@ class Tokenizer(ABC):
                 if aLine.startswith("%~"):
                     continue

-                cStyle, cText, _ = processComment(aLine)
+                cStyle, cKey, cText, _, _ = processComment(aLine)
                 if cStyle == nwComment.SYNOPSIS:
-                    self.
-
+                    tLine, tFmt = self._extractFormats(cText)
+                    tokens.append((
+                        self.T_SYNOPSIS, nHead, tLine, tFmt, sAlign
                     ))
-                    if self._doSynopsis and self.
+                    if self._doSynopsis and self._keepMD:
                         tmpMarkdown.append(f"{aLine}\n")
                 elif cStyle == nwComment.SHORT:
-                    self.
-
+                    tLine, tFmt = self._extractFormats(cText)
+                    tokens.append((
+                        self.T_SHORT, nHead, tLine, tFmt, sAlign
                     ))
-                    if self._doSynopsis and self.
+                    if self._doSynopsis and self._keepMD:
+                        tmpMarkdown.append(f"{aLine}\n")
+                elif cStyle == nwComment.FOOTNOTE:
+                    tLine, tFmt = self._extractFormats(cText, skip=self.FMT_FNOTE)
+                    self._footnotes[f"{tHandle}:{cKey}"] = (tLine, tFmt)
+                    if self._keepMD:
                         tmpMarkdown.append(f"{aLine}\n")
                 else:
-                    self.
-
+                    tLine, tFmt = self._extractFormats(cText)
+                    tokens.append((
+                        self.T_COMMENT, nHead, tLine, tFmt, sAlign
                     ))
-                    if self._doComments and self.
+                    if self._doComments and self._keepMD:
                         tmpMarkdown.append(f"{aLine}\n")

             elif aLine.startswith("@"):
@@ -560,11 +638,14 @@ class Tokenizer(ABC):
                 # are automatically skipped.

                 valid, bits, _ = self._project.index.scanThis(aLine)
-                if
-
+                if (
+                    valid and bits and bits[0] in nwLabels.KEY_NAME
+                    and bits[0] not in self._skipKeywords
+                ):
+                    tokens.append((
                         self.T_KEYWORD, nHead, aLine[1:].strip(), [], sAlign
                     ))
-                    if self._doKeywords and self.
+                    if self._doKeywords and self._keepMD:
                         tmpMarkdown.append(f"{aLine}\n")

             elif aLine.startswith(("# ", "#! ")):
@@ -597,10 +678,10 @@ class Tokenizer(ABC):
                     self._hFormatter.resetAll()
                     self._noSep = True

-
+                tokens.append((
                     tType, nHead, tText, [], tStyle
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")

             elif aLine.startswith(("## ", "##! ")):
@@ -632,10 +713,10 @@ class Tokenizer(ABC):
                     self._hFormatter.resetScene()
                     self._noSep = True

-
+                tokens.append((
                     tType, nHead, tText, [], tStyle
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")

             elif aLine.startswith(("### ", "###! ")):
@@ -673,10 +754,10 @@ class Tokenizer(ABC):
                     tStyle = self.A_NONE if self._noSep else self.A_CENTRE
                     self._noSep = False

-
+                tokens.append((
                     tType, nHead, tText, [], tStyle
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")

             elif aLine.startswith("#### "):
@@ -703,10 +784,10 @@ class Tokenizer(ABC):
                     tType = self.T_SEP
                     tStyle = self.A_CENTRE

-
+                tokens.append((
                     tType, nHead, tText, [], tStyle
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")

             else:
@@ -750,54 +831,117 @@ class Tokenizer(ABC):
                     sAlign |= self.A_IND_R

                 # Process formats
-                tLine,
-
-                    self.T_TEXT, nHead, tLine,
+                tLine, tFmt = self._extractFormats(aLine)
+                tokens.append((
+                    self.T_TEXT, nHead, tLine, tFmt, sAlign
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")

         # If we have content, turn off the first page flag
-        if self._isFirst and
+        if self._isFirst and tokens:
            self._isFirst = False  # First document has been processed

            # Make sure the token array doesn't start with a page break
            # on the very first page, adding a blank first page.
-            if
-
-
-
+            if tokens[0][4] & self.A_PBB:
+                cToken = tokens[0]
+                tokens[0] = (
+                    cToken[0], cToken[1], cToken[2], cToken[3], cToken[4] & ~self.A_PBB
                 )

         # Always add an empty line at the end of the file
-
+        tokens.append((
            self.T_EMPTY, nHead, "", [], self.A_NONE
        ))
-        if self.
+        if self._keepMD:
            tmpMarkdown.append("\n")
-            self.
+            self._markdown.append("".join(tmpMarkdown))

         # Second Pass
         # ===========
-        #
+        # This second pass strips away consecutive blank lines, and
+        # combines consecutive text lines into the same paragraph.
+        # It also ensures that there isn't paragraph spacing between
+        # meta data lines for formats that has spacing.
+
+        self._tokens = []
+        pToken: T_Token = (self.T_EMPTY, 0, "", [], self.A_NONE)
+        nToken: T_Token = (self.T_EMPTY, 0, "", [], self.A_NONE)
+
+        lineSep = "\n" if self._keepBreaks else " "
+        pLines: list[T_Token] = []

-
-
-
-        for n, token in enumerate(self._tokens):
+        tCount = len(tokens)
+        pIndent = True
+        for n, cToken in enumerate(tokens):

             if n > 0:
-                pToken =
+                pToken = tokens[n-1]  # Look behind
             if n < tCount - 1:
-                nToken =
+                nToken = tokens[n+1]  # Look ahead
+
+            if not self._indentFirst and cToken[0] in self.L_SKIP_INDENT:
+                # Unless the indentFirst flag is set, we set up the next
+                # paragraph to not be indented if we see a block of a
+                # specific type
+                pIndent = False
+
+            if cToken[0] == self.T_EMPTY:
+                # We don't need to keep the empty lines after this pass
+                pass

-
-
+            elif cToken[0] == self.T_KEYWORD:
+                # Adjust margins for lines in a list of keyword lines
+                aStyle = cToken[4]
                 if pToken[0] == self.T_KEYWORD:
                     aStyle |= self.A_Z_TOPMRG
                 if nToken[0] == self.T_KEYWORD:
                     aStyle |= self.A_Z_BTMMRG
-                self._tokens
+                self._tokens.append((
+                    cToken[0], cToken[1], cToken[2], cToken[3], aStyle
+                ))
+
+            elif cToken[0] == self.T_TEXT:
+                # Combine lines from the same paragraph
+                pLines.append(cToken)
+
+                if nToken[0] != self.T_TEXT:
+                    # Next token is not text, so we add the buffer to tokens
+                    nLines = len(pLines)
+                    cStyle = pLines[0][4]
+                    if self._firstIndent and pIndent and not cStyle & self.M_ALIGNED:
+                        # If paragraph indentation is enabled, not temporarily
+                        # turned off, and the block is not aligned, we add the
+                        # text indentation flag
+                        cStyle |= self.A_IND_T
+
+                    if nLines == 1:
+                        # The paragraph contains a single line, so we just
+                        # save that directly to the token list
+                        self._tokens.append((
+                            self.T_TEXT, pLines[0][1], pLines[0][2], pLines[0][3], cStyle
+                        ))
+                    elif nLines > 1:
+                        # The paragraph contains multiple lines, so we need to
+                        # join them according to the line break policy, and
+                        # recompute all the formatting markers
+                        tTxt = ""
+                        tFmt: T_Formats = []
+                        for aToken in pLines:
+                            tLen = len(tTxt)
+                            tTxt += f"{aToken[2]}{lineSep}"
+                            tFmt.extend((p+tLen, fmt, key) for p, fmt, key in aToken[3])
+                        self._tokens.append((
+                            self.T_TEXT, pLines[0][1], tTxt[:-1], tFmt, cStyle
+                        ))
+
+                    # Reset buffer and make sure text indent is on for next pass
+                    pLines = []
+                    pIndent = True
+
+            else:
+                self._tokens.append(cToken)

         return

@@ -840,7 +984,6 @@ class Tokenizer(ABC):
         textWordChars = self._counts.get("textWordChars", 0)
         titleWordChars = self._counts.get("titleWordChars", 0)

-        para = []
         for tType, _, tText, _, _ in self._tokens:
             tText = tText.replace(nwUnicode.U_ENDASH, " ")
             tText = tText.replace(nwUnicode.U_EMDASH, " ")
@@ -850,22 +993,19 @@ class Tokenizer(ABC):
             nChars = len(tText)
             nWChars = len("".join(tWords))

-            if tType == self.
-
-
-
-
-
-
-
-
-
-
-
-
-                allWordChars += nPWChars
-                textWordChars += nPWChars
-                para = []
+            if tType == self.T_TEXT:
+                tPWords = tText.split()
+                nPWords = len(tPWords)
+                nPChars = len(tText)
+                nPWChars = len("".join(tPWords))
+
+                paragraphCount += 1
+                allWords += nPWords
+                textWords += nPWords
+                allChars += nPChars
+                textChars += nPChars
+                allWordChars += nPWChars
+                textWordChars += nPWChars

             elif tType in self.L_HEADINGS:
                 titleCount += 1
@@ -881,9 +1021,6 @@ class Tokenizer(ABC):
                 allChars += nChars
                 allWordChars += nWChars

-            elif tType == self.T_TEXT:
-                para.append(tText.rstrip())
-
             elif tType == self.T_SYNOPSIS and self._doSynopsis:
                 text = "{0}: {1}".format(self._localLookup("Synopsis"), tText)
                 words = text.split()
@@ -935,7 +1072,7 @@ class Tokenizer(ABC):
     def saveRawMarkdown(self, path: str | Path) -> None:
         """Save the raw text to a plain text file."""
         with open(path, mode="w", encoding="utf-8") as outFile:
-            for nwdPage in self.
+            for nwdPage in self._markdown:
                 outFile.write(nwdPage)
         return

@@ -950,7 +1087,7 @@ class Tokenizer(ABC):
                 "buildTimeStr": formatTimeStamp(timeStamp),
             },
             "text": {
-                "nwd": [page.rstrip("\n").split("\n") for page in self.
+                "nwd": [page.rstrip("\n").split("\n") for page in self._markdown],
             }
         }
         with open(path, mode="w", encoding="utf-8") as fObj:
@@ -961,9 +1098,9 @@ class Tokenizer(ABC):
     # Internal Functions
     ##

-    def _extractFormats(self, text: str) -> tuple[str,
+    def _extractFormats(self, text: str, skip: int = 0) -> tuple[str, T_Formats]:
         """Extract format markers from a text paragraph."""
-        temp = []
+        temp: list[tuple[int, int, int, str]] = []

         # Match Markdown
         for regEx, fmts in self._rxMarkdown:
@@ -971,7 +1108,7 @@ class Tokenizer(ABC):
             while rxItt.hasNext():
                 rxMatch = rxItt.next()
                 temp.extend(
-
+                    (rxMatch.capturedStart(n), rxMatch.capturedLength(n), fmt, "")
                     for n, fmt in enumerate(fmts) if fmt > 0
                 )

@@ -979,25 +1116,46 @@ class Tokenizer(ABC):
         rxItt = self._rxShortCodes.globalMatch(text, 0)
         while rxItt.hasNext():
             rxMatch = rxItt.next()
-            temp.append(
+            temp.append((
                 rxMatch.capturedStart(1),
                 rxMatch.capturedLength(1),
-                self._shortCodeFmt.get(rxMatch.captured(1).lower(), 0)
-
+                self._shortCodeFmt.get(rxMatch.captured(1).lower(), 0),
+                "",
+            ))

-        #
+        # Match Shortcode w/Values
+        rxItt = self._rxShortCodeVals.globalMatch(text, 0)
+        tHandle = self._handle or ""
+        while rxItt.hasNext():
+            rxMatch = rxItt.next()
+            kind = self._shortCodeVals.get(rxMatch.captured(1).lower(), 0)
+            temp.append((
+                rxMatch.capturedStart(0),
+                rxMatch.capturedLength(0),
+                self.FMT_STRIP if kind == skip else kind,
+                f"{tHandle}:{rxMatch.captured(2)}",
+            ))
+
+        # Match Dialogue
+        if self._rxDialogue:
+            for regEx, fmtB, fmtE in self._rxDialogue:
+                rxItt = regEx.globalMatch(text, 0)
+                while rxItt.hasNext():
+                    rxMatch = rxItt.next()
+                    temp.append((rxMatch.capturedStart(0), 0, fmtB, ""))
+                    temp.append((rxMatch.capturedEnd(0), 0, fmtE, ""))
+
+        # Post-process text and format
         result = text
         formats = []
-        for pos, n, fmt in reversed(sorted(temp, key=lambda x: x[0])):
+        for pos, n, fmt, key in reversed(sorted(temp, key=lambda x: x[0])):
             if fmt > 0:
                 result = result[:pos] + result[pos+n:]
-                formats = [(p-n, f) for p, f in formats]
-                formats.insert(0, (pos, fmt))
+                formats = [(p-n, f, k) for p, f, k in formats]
+                formats.insert(0, (pos, fmt, key))

         return result, formats

-# END Class Tokenizer
-

 class HeadingFormatter:

@@ -1067,5 +1225,3 @@ class HeadingFormatter:
         hFormat = hFormat.replace(nwHeadFmt.CHAR_FOCUS, fText)

         return hFormat
-
-# END Class HeadingFormatter