novelWriter 2.4.4__py3-none-any.whl → 2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {novelWriter-2.4.4.dist-info → novelWriter-2.5.dist-info}/METADATA +4 -5
- {novelWriter-2.4.4.dist-info → novelWriter-2.5.dist-info}/RECORD +121 -111
- {novelWriter-2.4.4.dist-info → novelWriter-2.5.dist-info}/WHEEL +1 -1
- novelwriter/__init__.py +33 -39
- novelwriter/assets/i18n/nw_de_DE.qm +0 -0
- novelwriter/assets/i18n/nw_en_US.qm +0 -0
- novelwriter/assets/i18n/nw_es_419.qm +0 -0
- novelwriter/assets/i18n/nw_fr_FR.qm +0 -0
- novelwriter/assets/i18n/nw_it_IT.qm +0 -0
- novelwriter/assets/i18n/nw_ja_JP.qm +0 -0
- novelwriter/assets/i18n/nw_nb_NO.qm +0 -0
- novelwriter/assets/i18n/nw_nl_NL.qm +0 -0
- novelwriter/assets/i18n/nw_pl_PL.qm +0 -0
- novelwriter/assets/i18n/nw_pt_BR.qm +0 -0
- novelwriter/assets/i18n/nw_zh_CN.qm +0 -0
- novelwriter/assets/i18n/project_en_GB.json +1 -0
- novelwriter/assets/i18n/project_pl_PL.json +116 -0
- novelwriter/assets/icons/typicons_dark/icons.conf +2 -0
- novelwriter/assets/icons/typicons_dark/nw_font.svg +4 -0
- novelwriter/assets/icons/typicons_dark/nw_quote.svg +4 -0
- novelwriter/assets/icons/typicons_light/icons.conf +2 -0
- novelwriter/assets/icons/typicons_light/nw_font.svg +4 -0
- novelwriter/assets/icons/typicons_light/nw_quote.svg +4 -0
- novelwriter/assets/manual.pdf +0 -0
- novelwriter/assets/sample.zip +0 -0
- novelwriter/assets/syntax/cyberpunk_night.conf +5 -3
- novelwriter/assets/syntax/default_dark.conf +32 -18
- novelwriter/assets/syntax/default_light.conf +24 -10
- novelwriter/assets/syntax/dracula.conf +44 -0
- novelwriter/assets/syntax/grey_dark.conf +5 -4
- novelwriter/assets/syntax/grey_light.conf +5 -4
- novelwriter/assets/syntax/light_owl.conf +7 -6
- novelwriter/assets/syntax/night_owl.conf +7 -6
- novelwriter/assets/syntax/snazzy.conf +42 -0
- novelwriter/assets/syntax/solarized_dark.conf +4 -3
- novelwriter/assets/syntax/solarized_light.conf +4 -3
- novelwriter/assets/syntax/tango.conf +27 -11
- novelwriter/assets/syntax/tomorrow.conf +6 -5
- novelwriter/assets/syntax/tomorrow_night.conf +7 -6
- novelwriter/assets/syntax/tomorrow_night_blue.conf +6 -5
- novelwriter/assets/syntax/tomorrow_night_bright.conf +6 -5
- novelwriter/assets/syntax/tomorrow_night_eighties.conf +6 -5
- novelwriter/assets/text/credits_en.htm +52 -41
- novelwriter/assets/themes/cyberpunk_night.conf +3 -0
- novelwriter/assets/themes/default_dark.conf +2 -0
- novelwriter/assets/themes/default_light.conf +2 -0
- novelwriter/assets/themes/dracula.conf +48 -0
- novelwriter/assets/themes/solarized_dark.conf +2 -0
- novelwriter/assets/themes/solarized_light.conf +2 -0
- novelwriter/common.py +33 -12
- novelwriter/config.py +184 -98
- novelwriter/constants.py +47 -35
- novelwriter/core/buildsettings.py +68 -69
- novelwriter/core/coretools.py +5 -23
- novelwriter/core/docbuild.py +52 -40
- novelwriter/core/document.py +3 -5
- novelwriter/core/index.py +115 -45
- novelwriter/core/item.py +8 -19
- novelwriter/core/options.py +2 -4
- novelwriter/core/project.py +37 -61
- novelwriter/core/projectdata.py +1 -3
- novelwriter/core/projectxml.py +12 -15
- novelwriter/core/sessions.py +3 -5
- novelwriter/core/spellcheck.py +4 -9
- novelwriter/core/status.py +211 -164
- novelwriter/core/storage.py +0 -8
- novelwriter/core/tohtml.py +139 -105
- novelwriter/core/tokenizer.py +278 -122
- novelwriter/core/{tomd.py → tomarkdown.py} +97 -78
- novelwriter/core/toodt.py +257 -166
- novelwriter/core/toqdoc.py +419 -0
- novelwriter/core/tree.py +5 -7
- novelwriter/dialogs/about.py +11 -18
- novelwriter/dialogs/docmerge.py +17 -19
- novelwriter/dialogs/docsplit.py +17 -19
- novelwriter/dialogs/editlabel.py +6 -10
- novelwriter/dialogs/preferences.py +200 -164
- novelwriter/dialogs/projectsettings.py +225 -189
- novelwriter/dialogs/quotes.py +12 -9
- novelwriter/dialogs/wordlist.py +9 -15
- novelwriter/enum.py +35 -30
- novelwriter/error.py +8 -15
- novelwriter/extensions/configlayout.py +55 -21
- novelwriter/extensions/eventfilters.py +1 -5
- novelwriter/extensions/modified.py +58 -14
- novelwriter/extensions/novelselector.py +1 -3
- novelwriter/extensions/pagedsidebar.py +9 -12
- novelwriter/extensions/{circularprogress.py → progressbars.py} +30 -8
- novelwriter/extensions/statusled.py +40 -26
- novelwriter/extensions/switch.py +4 -6
- novelwriter/extensions/switchbox.py +7 -6
- novelwriter/extensions/versioninfo.py +3 -9
- novelwriter/gui/doceditor.py +120 -139
- novelwriter/gui/dochighlight.py +231 -186
- novelwriter/gui/docviewer.py +69 -108
- novelwriter/gui/docviewerpanel.py +3 -10
- novelwriter/gui/editordocument.py +1 -3
- novelwriter/gui/itemdetails.py +7 -11
- novelwriter/gui/mainmenu.py +22 -18
- novelwriter/gui/noveltree.py +11 -24
- novelwriter/gui/outline.py +15 -26
- novelwriter/gui/projtree.py +35 -60
- novelwriter/gui/search.py +10 -3
- novelwriter/gui/sidebar.py +2 -6
- novelwriter/gui/statusbar.py +29 -37
- novelwriter/gui/theme.py +26 -48
- novelwriter/guimain.py +162 -160
- novelwriter/shared.py +36 -32
- novelwriter/text/patterns.py +113 -0
- novelwriter/tools/dictionaries.py +10 -20
- novelwriter/tools/lipsum.py +10 -16
- novelwriter/tools/manusbuild.py +9 -11
- novelwriter/tools/manuscript.py +71 -145
- novelwriter/tools/manussettings.py +71 -75
- novelwriter/tools/noveldetails.py +16 -21
- novelwriter/tools/welcome.py +21 -26
- novelwriter/tools/writingstats.py +9 -12
- novelwriter/types.py +49 -4
- novelwriter/extensions/simpleprogress.py +0 -55
- {novelWriter-2.4.4.dist-info → novelWriter-2.5.dist-info}/LICENSE.md +0 -0
- {novelWriter-2.4.4.dist-info → novelWriter-2.5.dist-info}/entry_points.txt +0 -0
- {novelWriter-2.4.4.dist-info → novelWriter-2.5.dist-info}/top_level.txt +0 -0
novelwriter/core/tokenizer.py
CHANGED
@@ -24,30 +24,35 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
24
24
|
"""
|
25
25
|
from __future__ import annotations
|
26
26
|
|
27
|
-
import re
|
28
27
|
import json
|
29
28
|
import logging
|
29
|
+
import re
|
30
30
|
|
31
31
|
from abc import ABC, abstractmethod
|
32
|
-
from time import time
|
33
|
-
from pathlib import Path
|
34
32
|
from functools import partial
|
33
|
+
from pathlib import Path
|
34
|
+
from time import time
|
35
35
|
|
36
36
|
from PyQt5.QtCore import QCoreApplication, QRegularExpression
|
37
|
+
from PyQt5.QtGui import QFont
|
37
38
|
|
38
|
-
from novelwriter
|
39
|
-
from novelwriter.
|
40
|
-
|
41
|
-
)
|
39
|
+
from novelwriter import CONFIG
|
40
|
+
from novelwriter.common import checkInt, formatTimeStamp, numberToRoman
|
41
|
+
from novelwriter.constants import nwHeadFmt, nwKeyWords, nwLabels, nwShortcode, nwUnicode, trConst
|
42
42
|
from novelwriter.core.index import processComment
|
43
43
|
from novelwriter.core.project import NWProject
|
44
44
|
from novelwriter.enum import nwComment, nwItemLayout
|
45
|
+
from novelwriter.text.patterns import REGEX_PATTERNS
|
45
46
|
|
46
47
|
logger = logging.getLogger(__name__)
|
47
48
|
|
48
49
|
ESCAPES = {r"\*": "*", r"\~": "~", r"\_": "_", r"\[": "[", r"\]": "]", r"\ ": ""}
|
49
50
|
RX_ESC = re.compile("|".join([re.escape(k) for k in ESCAPES.keys()]), flags=re.DOTALL)
|
50
51
|
|
52
|
+
T_Formats = list[tuple[int, int, str]]
|
53
|
+
T_Comment = tuple[str, T_Formats]
|
54
|
+
T_Token = tuple[int, int, str, T_Formats, int]
|
55
|
+
|
51
56
|
|
52
57
|
def stripEscape(text: str) -> str:
|
53
58
|
"""Strip escaped Markdown characters from paragraph text."""
|
@@ -80,6 +85,12 @@ class Tokenizer(ABC):
|
|
80
85
|
FMT_SUP_E = 12 # End superscript
|
81
86
|
FMT_SUB_B = 13 # Begin subscript
|
82
87
|
FMT_SUB_E = 14 # End subscript
|
88
|
+
FMT_DL_B = 15 # Begin dialogue
|
89
|
+
FMT_DL_E = 16 # End dialogue
|
90
|
+
FMT_ADL_B = 17 # Begin alt dialogue
|
91
|
+
FMT_ADL_E = 18 # End alt dialogue
|
92
|
+
FMT_FNOTE = 19 # Footnote marker
|
93
|
+
FMT_STRIP = 20 # Strip the format code
|
83
94
|
|
84
95
|
# Block Type
|
85
96
|
T_EMPTY = 1 # Empty line (new paragraph)
|
@@ -108,48 +119,60 @@ class Tokenizer(ABC):
|
|
108
119
|
A_Z_BTMMRG = 0x0080 # Zero bottom margin
|
109
120
|
A_IND_L = 0x0100 # Left indentation
|
110
121
|
A_IND_R = 0x0200 # Right indentation
|
122
|
+
A_IND_T = 0x0400 # Text indentation
|
123
|
+
|
124
|
+
# Masks
|
125
|
+
M_ALIGNED = A_LEFT | A_RIGHT | A_CENTRE | A_JUSTIFY
|
111
126
|
|
112
127
|
# Lookups
|
113
128
|
L_HEADINGS = [T_TITLE, T_HEAD1, T_HEAD2, T_HEAD3, T_HEAD4]
|
129
|
+
L_SKIP_INDENT = [T_TITLE, T_HEAD1, T_HEAD2, T_HEAD2, T_HEAD3, T_HEAD4, T_SEP, T_SKIP]
|
130
|
+
L_SUMMARY = [T_SYNOPSIS, T_SHORT]
|
114
131
|
|
115
132
|
def __init__(self, project: NWProject) -> None:
|
116
133
|
|
117
134
|
self._project = project
|
118
135
|
|
119
136
|
# Data Variables
|
120
|
-
self._text = ""
|
121
|
-
self._handle = None
|
122
|
-
self._result = ""
|
137
|
+
self._text = "" # The raw text to be tokenized
|
138
|
+
self._handle = None # The item handle currently being processed
|
139
|
+
self._result = "" # The result of the last document
|
140
|
+
self._keepMD = False # Whether to keep the markdown text
|
123
141
|
|
124
|
-
|
125
|
-
self.
|
142
|
+
# Tokens and Meta Data (Per Document)
|
143
|
+
self._tokens: list[T_Token] = []
|
144
|
+
self._footnotes: dict[str, T_Comment] = {}
|
126
145
|
|
127
|
-
#
|
128
|
-
self._tokens: list[tuple[int, int, str, list[tuple[int, int]], int]] = []
|
146
|
+
# Tokens and Meta Data (Per Instance)
|
129
147
|
self._counts: dict[str, int] = {}
|
130
148
|
self._outline: dict[str, str] = {}
|
149
|
+
self._markdown: list[str] = []
|
131
150
|
|
132
151
|
# User Settings
|
133
|
-
self._textFont = "Serif" # Output text font
|
134
|
-
self._textSize = 11 # Output text size
|
135
|
-
self._textFixed = False # Fixed width text
|
152
|
+
self._textFont = QFont("Serif", 11) # Output text font
|
136
153
|
self._lineHeight = 1.15 # Line height in units of em
|
137
154
|
self._blockIndent = 4.00 # Block indent in units of em
|
155
|
+
self._firstIndent = False # Enable first line indent
|
156
|
+
self._firstWidth = 1.40 # First line indent in units of em
|
157
|
+
self._indentFirst = False # Indent first paragraph
|
138
158
|
self._doJustify = False # Justify text
|
139
159
|
self._doBodyText = True # Include body text
|
140
160
|
self._doSynopsis = False # Also process synopsis comments
|
141
161
|
self._doComments = False # Also process comments
|
142
162
|
self._doKeywords = False # Also process keywords like tags and references
|
143
163
|
self._skipKeywords = set() # Keywords to ignore
|
164
|
+
self._keepBreaks = True # Keep line breaks in paragraphs
|
144
165
|
|
145
166
|
# Margins
|
146
|
-
self._marginTitle = (1.
|
147
|
-
self._marginHead1 = (1.
|
148
|
-
self._marginHead2 = (
|
149
|
-
self._marginHead3 = (
|
150
|
-
self._marginHead4 = (
|
167
|
+
self._marginTitle = (1.417, 0.500)
|
168
|
+
self._marginHead1 = (1.417, 0.500)
|
169
|
+
self._marginHead2 = (1.668, 0.500)
|
170
|
+
self._marginHead3 = (1.168, 0.500)
|
171
|
+
self._marginHead4 = (1.168, 0.500)
|
151
172
|
self._marginText = (0.000, 0.584)
|
152
173
|
self._marginMeta = (0.000, 0.584)
|
174
|
+
self._marginFoot = (1.417, 0.467)
|
175
|
+
self._marginSep = (1.168, 1.168)
|
153
176
|
|
154
177
|
# Title Formats
|
155
178
|
self._fmtTitle = nwHeadFmt.TITLE # Formatting for titles
|
@@ -174,7 +197,9 @@ class Tokenizer(ABC):
|
|
174
197
|
|
175
198
|
# Instance Variables
|
176
199
|
self._hFormatter = HeadingFormatter(self._project)
|
177
|
-
self._noSep = True
|
200
|
+
self._noSep = True # Flag to indicate that we don't want a scene separator
|
201
|
+
self._noIndent = False # Flag to disable text indent on next paragraph
|
202
|
+
self._showDialog = False # Flag for dialogue highlighting
|
178
203
|
|
179
204
|
# This File
|
180
205
|
self._isNovel = False # Document is a novel document
|
@@ -189,12 +214,12 @@ class Tokenizer(ABC):
|
|
189
214
|
|
190
215
|
# Format RegEx
|
191
216
|
self._rxMarkdown = [
|
192
|
-
(
|
193
|
-
(
|
194
|
-
(
|
217
|
+
(REGEX_PATTERNS.markdownItalic, [0, self.FMT_I_B, 0, self.FMT_I_E]),
|
218
|
+
(REGEX_PATTERNS.markdownBold, [0, self.FMT_B_B, 0, self.FMT_B_E]),
|
219
|
+
(REGEX_PATTERNS.markdownStrike, [0, self.FMT_D_B, 0, self.FMT_D_E]),
|
195
220
|
]
|
196
|
-
self._rxShortCodes =
|
197
|
-
self._rxShortCodeVals =
|
221
|
+
self._rxShortCodes = REGEX_PATTERNS.shortcodePlain
|
222
|
+
self._rxShortCodeVals = REGEX_PATTERNS.shortcodeValue
|
198
223
|
|
199
224
|
self._shortCodeFmt = {
|
200
225
|
nwShortcode.ITALIC_O: self.FMT_I_B, nwShortcode.ITALIC_C: self.FMT_I_E,
|
@@ -205,6 +230,11 @@ class Tokenizer(ABC):
|
|
205
230
|
nwShortcode.SUP_O: self.FMT_SUP_B, nwShortcode.SUP_C: self.FMT_SUP_E,
|
206
231
|
nwShortcode.SUB_O: self.FMT_SUB_B, nwShortcode.SUB_C: self.FMT_SUB_E,
|
207
232
|
}
|
233
|
+
self._shortCodeVals = {
|
234
|
+
nwShortcode.FOOTNOTE_B: self.FMT_FNOTE,
|
235
|
+
}
|
236
|
+
|
237
|
+
self._rxDialogue: list[tuple[QRegularExpression, int, int]] = []
|
208
238
|
|
209
239
|
return
|
210
240
|
|
@@ -220,7 +250,7 @@ class Tokenizer(ABC):
|
|
220
250
|
@property
|
221
251
|
def allMarkdown(self) -> list[str]:
|
222
252
|
"""The combined novelWriter Markdown text."""
|
223
|
-
return self.
|
253
|
+
return self._markdown
|
224
254
|
|
225
255
|
@property
|
226
256
|
def textStats(self) -> dict[str, int]:
|
@@ -298,11 +328,9 @@ class Tokenizer(ABC):
|
|
298
328
|
)
|
299
329
|
return
|
300
330
|
|
301
|
-
def setFont(self,
|
331
|
+
def setFont(self, font: QFont) -> None:
|
302
332
|
"""Set the build font."""
|
303
|
-
self._textFont =
|
304
|
-
self._textSize = round(int(size))
|
305
|
-
self._textFixed = isFixed
|
333
|
+
self._textFont = font
|
306
334
|
return
|
307
335
|
|
308
336
|
def setLineHeight(self, height: float) -> None:
|
@@ -315,11 +343,43 @@ class Tokenizer(ABC):
|
|
315
343
|
self._blockIndent = min(max(float(indent), 0.0), 10.0)
|
316
344
|
return
|
317
345
|
|
346
|
+
def setFirstLineIndent(self, state: bool, indent: float, first: bool) -> None:
|
347
|
+
"""Set first line indent and whether to also indent first
|
348
|
+
paragraph after a heading.
|
349
|
+
"""
|
350
|
+
self._firstIndent = state
|
351
|
+
self._firstWidth = indent
|
352
|
+
self._indentFirst = first
|
353
|
+
return
|
354
|
+
|
318
355
|
def setJustify(self, state: bool) -> None:
|
319
356
|
"""Enable or disable text justification."""
|
320
357
|
self._doJustify = state
|
321
358
|
return
|
322
359
|
|
360
|
+
def setDialogueHighlight(self, state: bool) -> None:
|
361
|
+
"""Enable or disable dialogue highlighting."""
|
362
|
+
self._rxDialogue = []
|
363
|
+
self._showDialog = state
|
364
|
+
if state:
|
365
|
+
if CONFIG.dialogStyle > 0:
|
366
|
+
self._rxDialogue.append((
|
367
|
+
REGEX_PATTERNS.dialogStyle, self.FMT_DL_B, self.FMT_DL_E
|
368
|
+
))
|
369
|
+
if CONFIG.dialogLine:
|
370
|
+
self._rxDialogue.append((
|
371
|
+
REGEX_PATTERNS.dialogLine, self.FMT_DL_B, self.FMT_DL_E
|
372
|
+
))
|
373
|
+
if CONFIG.narratorBreak:
|
374
|
+
self._rxDialogue.append((
|
375
|
+
REGEX_PATTERNS.narratorBreak, self.FMT_DL_E, self.FMT_DL_B
|
376
|
+
))
|
377
|
+
if CONFIG.altDialogOpen and CONFIG.altDialogClose:
|
378
|
+
self._rxDialogue.append((
|
379
|
+
REGEX_PATTERNS.altDialogStyle, self.FMT_ADL_B, self.FMT_ADL_E
|
380
|
+
))
|
381
|
+
return
|
382
|
+
|
323
383
|
def setTitleMargins(self, upper: float, lower: float) -> None:
|
324
384
|
"""Set the upper and lower title margin."""
|
325
385
|
self._marginTitle = (float(upper), float(lower))
|
@@ -355,6 +415,11 @@ class Tokenizer(ABC):
|
|
355
415
|
self._marginMeta = (float(upper), float(lower))
|
356
416
|
return
|
357
417
|
|
418
|
+
def setSeparatorMargins(self, upper: float, lower: float) -> None:
|
419
|
+
"""Set the upper and lower meta text margin."""
|
420
|
+
self._marginSep = (float(upper), float(lower))
|
421
|
+
return
|
422
|
+
|
358
423
|
def setLinkHeadings(self, state: bool) -> None:
|
359
424
|
"""Enable or disable adding an anchor before headings."""
|
360
425
|
self._linkHeadings = state
|
@@ -385,9 +450,14 @@ class Tokenizer(ABC):
|
|
385
450
|
self._skipKeywords = set(x.lower().strip() for x in keywords.split(","))
|
386
451
|
return
|
387
452
|
|
453
|
+
def setKeepLineBreaks(self, state: bool) -> None:
|
454
|
+
"""Keep line breaks in paragraphs."""
|
455
|
+
self._keepBreaks = state
|
456
|
+
return
|
457
|
+
|
388
458
|
def setKeepMarkdown(self, state: bool) -> None:
|
389
459
|
"""Keep original markdown during build."""
|
390
|
-
self.
|
460
|
+
self._keepMD = state
|
391
461
|
return
|
392
462
|
|
393
463
|
##
|
@@ -417,8 +487,8 @@ class Tokenizer(ABC):
|
|
417
487
|
self._tokens.append((
|
418
488
|
self.T_TITLE, 1, title, [], textAlign
|
419
489
|
))
|
420
|
-
if self.
|
421
|
-
self.
|
490
|
+
if self._keepMD:
|
491
|
+
self._markdown.append(f"#! {title}\n\n")
|
422
492
|
|
423
493
|
return
|
424
494
|
|
@@ -446,7 +516,7 @@ class Tokenizer(ABC):
|
|
446
516
|
self._text = xRep.sub(lambda x: repDict[x.group(0)], self._text)
|
447
517
|
|
448
518
|
# Process the character translation map
|
449
|
-
trDict = {nwUnicode.
|
519
|
+
trDict = {nwUnicode.U_MAPOS: nwUnicode.U_RSQUO}
|
450
520
|
self._text = self._text.translate(str.maketrans(trDict))
|
451
521
|
|
452
522
|
return
|
@@ -466,22 +536,23 @@ class Tokenizer(ABC):
|
|
466
536
|
4: The internal formatting map of the text, self.FMT_*
|
467
537
|
5: The style of the block, self.A_*
|
468
538
|
"""
|
469
|
-
self._tokens = []
|
470
539
|
if self._isNovel:
|
471
540
|
self._hFormatter.setHandle(self._handle)
|
472
541
|
|
473
542
|
nHead = 0
|
474
543
|
breakNext = False
|
475
544
|
tmpMarkdown = []
|
545
|
+
tHandle = self._handle or ""
|
546
|
+
tokens: list[T_Token] = []
|
476
547
|
for aLine in self._text.splitlines():
|
477
548
|
sLine = aLine.strip().lower()
|
478
549
|
|
479
550
|
# Check for blank lines
|
480
551
|
if len(sLine) == 0:
|
481
|
-
|
552
|
+
tokens.append((
|
482
553
|
self.T_EMPTY, nHead, "", [], self.A_NONE
|
483
554
|
))
|
484
|
-
if self.
|
555
|
+
if self._keepMD:
|
485
556
|
tmpMarkdown.append("\n")
|
486
557
|
|
487
558
|
continue
|
@@ -507,7 +578,7 @@ class Tokenizer(ABC):
|
|
507
578
|
continue
|
508
579
|
|
509
580
|
elif sLine == "[vspace]":
|
510
|
-
|
581
|
+
tokens.append(
|
511
582
|
(self.T_SKIP, nHead, "", [], sAlign)
|
512
583
|
)
|
513
584
|
continue
|
@@ -515,11 +586,11 @@ class Tokenizer(ABC):
|
|
515
586
|
elif sLine.startswith("[vspace:") and sLine.endswith("]"):
|
516
587
|
nSkip = checkInt(sLine[8:-1], 0)
|
517
588
|
if nSkip >= 1:
|
518
|
-
|
589
|
+
tokens.append(
|
519
590
|
(self.T_SKIP, nHead, "", [], sAlign)
|
520
591
|
)
|
521
592
|
if nSkip > 1:
|
522
|
-
|
593
|
+
tokens += (nSkip - 1) * [
|
523
594
|
(self.T_SKIP, nHead, "", [], self.A_NONE)
|
524
595
|
]
|
525
596
|
continue
|
@@ -533,24 +604,32 @@ class Tokenizer(ABC):
|
|
533
604
|
if aLine.startswith("%~"):
|
534
605
|
continue
|
535
606
|
|
536
|
-
cStyle, cText, _ = processComment(aLine)
|
607
|
+
cStyle, cKey, cText, _, _ = processComment(aLine)
|
537
608
|
if cStyle == nwComment.SYNOPSIS:
|
538
|
-
self.
|
539
|
-
|
609
|
+
tLine, tFmt = self._extractFormats(cText)
|
610
|
+
tokens.append((
|
611
|
+
self.T_SYNOPSIS, nHead, tLine, tFmt, sAlign
|
540
612
|
))
|
541
|
-
if self._doSynopsis and self.
|
613
|
+
if self._doSynopsis and self._keepMD:
|
542
614
|
tmpMarkdown.append(f"{aLine}\n")
|
543
615
|
elif cStyle == nwComment.SHORT:
|
544
|
-
self.
|
545
|
-
|
616
|
+
tLine, tFmt = self._extractFormats(cText)
|
617
|
+
tokens.append((
|
618
|
+
self.T_SHORT, nHead, tLine, tFmt, sAlign
|
546
619
|
))
|
547
|
-
if self._doSynopsis and self.
|
620
|
+
if self._doSynopsis and self._keepMD:
|
621
|
+
tmpMarkdown.append(f"{aLine}\n")
|
622
|
+
elif cStyle == nwComment.FOOTNOTE:
|
623
|
+
tLine, tFmt = self._extractFormats(cText, skip=self.FMT_FNOTE)
|
624
|
+
self._footnotes[f"{tHandle}:{cKey}"] = (tLine, tFmt)
|
625
|
+
if self._keepMD:
|
548
626
|
tmpMarkdown.append(f"{aLine}\n")
|
549
627
|
else:
|
550
|
-
self.
|
551
|
-
|
628
|
+
tLine, tFmt = self._extractFormats(cText)
|
629
|
+
tokens.append((
|
630
|
+
self.T_COMMENT, nHead, tLine, tFmt, sAlign
|
552
631
|
))
|
553
|
-
if self._doComments and self.
|
632
|
+
if self._doComments and self._keepMD:
|
554
633
|
tmpMarkdown.append(f"{aLine}\n")
|
555
634
|
|
556
635
|
elif aLine.startswith("@"):
|
@@ -560,11 +639,14 @@ class Tokenizer(ABC):
|
|
560
639
|
# are automatically skipped.
|
561
640
|
|
562
641
|
valid, bits, _ = self._project.index.scanThis(aLine)
|
563
|
-
if
|
564
|
-
|
642
|
+
if (
|
643
|
+
valid and bits and bits[0] in nwLabels.KEY_NAME
|
644
|
+
and bits[0] not in self._skipKeywords
|
645
|
+
):
|
646
|
+
tokens.append((
|
565
647
|
self.T_KEYWORD, nHead, aLine[1:].strip(), [], sAlign
|
566
648
|
))
|
567
|
-
if self._doKeywords and self.
|
649
|
+
if self._doKeywords and self._keepMD:
|
568
650
|
tmpMarkdown.append(f"{aLine}\n")
|
569
651
|
|
570
652
|
elif aLine.startswith(("# ", "#! ")):
|
@@ -597,10 +679,10 @@ class Tokenizer(ABC):
|
|
597
679
|
self._hFormatter.resetAll()
|
598
680
|
self._noSep = True
|
599
681
|
|
600
|
-
|
682
|
+
tokens.append((
|
601
683
|
tType, nHead, tText, [], tStyle
|
602
684
|
))
|
603
|
-
if self.
|
685
|
+
if self._keepMD:
|
604
686
|
tmpMarkdown.append(f"{aLine}\n")
|
605
687
|
|
606
688
|
elif aLine.startswith(("## ", "##! ")):
|
@@ -632,10 +714,10 @@ class Tokenizer(ABC):
|
|
632
714
|
self._hFormatter.resetScene()
|
633
715
|
self._noSep = True
|
634
716
|
|
635
|
-
|
717
|
+
tokens.append((
|
636
718
|
tType, nHead, tText, [], tStyle
|
637
719
|
))
|
638
|
-
if self.
|
720
|
+
if self._keepMD:
|
639
721
|
tmpMarkdown.append(f"{aLine}\n")
|
640
722
|
|
641
723
|
elif aLine.startswith(("### ", "###! ")):
|
@@ -673,10 +755,10 @@ class Tokenizer(ABC):
|
|
673
755
|
tStyle = self.A_NONE if self._noSep else self.A_CENTRE
|
674
756
|
self._noSep = False
|
675
757
|
|
676
|
-
|
758
|
+
tokens.append((
|
677
759
|
tType, nHead, tText, [], tStyle
|
678
760
|
))
|
679
|
-
if self.
|
761
|
+
if self._keepMD:
|
680
762
|
tmpMarkdown.append(f"{aLine}\n")
|
681
763
|
|
682
764
|
elif aLine.startswith("#### "):
|
@@ -703,10 +785,10 @@ class Tokenizer(ABC):
|
|
703
785
|
tType = self.T_SEP
|
704
786
|
tStyle = self.A_CENTRE
|
705
787
|
|
706
|
-
|
788
|
+
tokens.append((
|
707
789
|
tType, nHead, tText, [], tStyle
|
708
790
|
))
|
709
|
-
if self.
|
791
|
+
if self._keepMD:
|
710
792
|
tmpMarkdown.append(f"{aLine}\n")
|
711
793
|
|
712
794
|
else:
|
@@ -750,54 +832,116 @@ class Tokenizer(ABC):
|
|
750
832
|
sAlign |= self.A_IND_R
|
751
833
|
|
752
834
|
# Process formats
|
753
|
-
tLine,
|
754
|
-
|
755
|
-
self.T_TEXT, nHead, tLine,
|
835
|
+
tLine, tFmt = self._extractFormats(aLine)
|
836
|
+
tokens.append((
|
837
|
+
self.T_TEXT, nHead, tLine, tFmt, sAlign
|
756
838
|
))
|
757
|
-
if self.
|
839
|
+
if self._keepMD:
|
758
840
|
tmpMarkdown.append(f"{aLine}\n")
|
759
841
|
|
760
842
|
# If we have content, turn off the first page flag
|
761
|
-
if self._isFirst and
|
843
|
+
if self._isFirst and tokens:
|
762
844
|
self._isFirst = False # First document has been processed
|
763
845
|
|
764
846
|
# Make sure the token array doesn't start with a page break
|
765
847
|
# on the very first page, adding a blank first page.
|
766
|
-
if
|
767
|
-
|
768
|
-
|
769
|
-
|
848
|
+
if tokens[0][4] & self.A_PBB:
|
849
|
+
cToken = tokens[0]
|
850
|
+
tokens[0] = (
|
851
|
+
cToken[0], cToken[1], cToken[2], cToken[3], cToken[4] & ~self.A_PBB
|
770
852
|
)
|
771
853
|
|
772
854
|
# Always add an empty line at the end of the file
|
773
|
-
|
855
|
+
tokens.append((
|
774
856
|
self.T_EMPTY, nHead, "", [], self.A_NONE
|
775
857
|
))
|
776
|
-
if self.
|
858
|
+
if self._keepMD:
|
777
859
|
tmpMarkdown.append("\n")
|
778
|
-
self.
|
860
|
+
self._markdown.append("".join(tmpMarkdown))
|
779
861
|
|
780
862
|
# Second Pass
|
781
863
|
# ===========
|
782
|
-
#
|
864
|
+
# This second pass strips away consecutive blank lines, and
|
865
|
+
# combines consecutive text lines into the same paragraph.
|
866
|
+
# It also ensures that there isn't paragraph spacing between
|
867
|
+
# meta data lines for formats that has spacing.
|
868
|
+
|
869
|
+
self._tokens = []
|
870
|
+
pToken: T_Token = (self.T_EMPTY, 0, "", [], self.A_NONE)
|
871
|
+
nToken: T_Token = (self.T_EMPTY, 0, "", [], self.A_NONE)
|
872
|
+
|
873
|
+
lineSep = "\n" if self._keepBreaks else " "
|
874
|
+
pLines: list[T_Token] = []
|
783
875
|
|
784
|
-
|
785
|
-
|
786
|
-
tCount = len(self._tokens)
|
787
|
-
for n, token in enumerate(self._tokens):
|
876
|
+
tCount = len(tokens)
|
877
|
+
for n, cToken in enumerate(tokens):
|
788
878
|
|
789
879
|
if n > 0:
|
790
|
-
pToken =
|
880
|
+
pToken = tokens[n-1] # Look behind
|
791
881
|
if n < tCount - 1:
|
792
|
-
nToken =
|
882
|
+
nToken = tokens[n+1] # Look ahead
|
883
|
+
|
884
|
+
if cToken[0] in self.L_SKIP_INDENT and not self._indentFirst:
|
885
|
+
# Unless the indentFirst flag is set, we set up the next
|
886
|
+
# paragraph to not be indented if we see a block of a
|
887
|
+
# specific type
|
888
|
+
self._noIndent = True
|
889
|
+
|
890
|
+
if cToken[0] == self.T_EMPTY:
|
891
|
+
# We don't need to keep the empty lines after this pass
|
892
|
+
pass
|
793
893
|
|
794
|
-
|
795
|
-
|
894
|
+
elif cToken[0] == self.T_KEYWORD:
|
895
|
+
# Adjust margins for lines in a list of keyword lines
|
896
|
+
aStyle = cToken[4]
|
796
897
|
if pToken[0] == self.T_KEYWORD:
|
797
898
|
aStyle |= self.A_Z_TOPMRG
|
798
899
|
if nToken[0] == self.T_KEYWORD:
|
799
900
|
aStyle |= self.A_Z_BTMMRG
|
800
|
-
self._tokens
|
901
|
+
self._tokens.append((
|
902
|
+
cToken[0], cToken[1], cToken[2], cToken[3], aStyle
|
903
|
+
))
|
904
|
+
|
905
|
+
elif cToken[0] == self.T_TEXT:
|
906
|
+
# Combine lines from the same paragraph
|
907
|
+
pLines.append(cToken)
|
908
|
+
|
909
|
+
if nToken[0] != self.T_TEXT:
|
910
|
+
# Next token is not text, so we add the buffer to tokens
|
911
|
+
nLines = len(pLines)
|
912
|
+
cStyle = pLines[0][4]
|
913
|
+
if self._firstIndent and not (self._noIndent or cStyle & self.M_ALIGNED):
|
914
|
+
# If paragraph indentation is enabled, not temporarily
|
915
|
+
# turned off, and the block is not aligned, we add the
|
916
|
+
# text indentation flag
|
917
|
+
cStyle |= self.A_IND_T
|
918
|
+
|
919
|
+
if nLines == 1:
|
920
|
+
# The paragraph contains a single line, so we just
|
921
|
+
# save that directly to the token list
|
922
|
+
self._tokens.append((
|
923
|
+
self.T_TEXT, pLines[0][1], pLines[0][2], pLines[0][3], cStyle
|
924
|
+
))
|
925
|
+
elif nLines > 1:
|
926
|
+
# The paragraph contains multiple lines, so we need to
|
927
|
+
# join them according to the line break policy, and
|
928
|
+
# recompute all the formatting markers
|
929
|
+
tTxt = ""
|
930
|
+
tFmt: T_Formats = []
|
931
|
+
for aToken in pLines:
|
932
|
+
tLen = len(tTxt)
|
933
|
+
tTxt += f"{aToken[2]}{lineSep}"
|
934
|
+
tFmt.extend((p+tLen, fmt, key) for p, fmt, key in aToken[3])
|
935
|
+
self._tokens.append((
|
936
|
+
self.T_TEXT, pLines[0][1], tTxt[:-1], tFmt, cStyle
|
937
|
+
))
|
938
|
+
|
939
|
+
# Reset buffer and make sure text indent is on for next pass
|
940
|
+
pLines = []
|
941
|
+
self._noIndent = False
|
942
|
+
|
943
|
+
else:
|
944
|
+
self._tokens.append(cToken)
|
801
945
|
|
802
946
|
return
|
803
947
|
|
@@ -840,7 +984,6 @@ class Tokenizer(ABC):
|
|
840
984
|
textWordChars = self._counts.get("textWordChars", 0)
|
841
985
|
titleWordChars = self._counts.get("titleWordChars", 0)
|
842
986
|
|
843
|
-
para = []
|
844
987
|
for tType, _, tText, _, _ in self._tokens:
|
845
988
|
tText = tText.replace(nwUnicode.U_ENDASH, " ")
|
846
989
|
tText = tText.replace(nwUnicode.U_EMDASH, " ")
|
@@ -850,22 +993,19 @@ class Tokenizer(ABC):
|
|
850
993
|
nChars = len(tText)
|
851
994
|
nWChars = len("".join(tWords))
|
852
995
|
|
853
|
-
if tType == self.
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
allWordChars += nPWChars
|
867
|
-
textWordChars += nPWChars
|
868
|
-
para = []
|
996
|
+
if tType == self.T_TEXT:
|
997
|
+
tPWords = tText.split()
|
998
|
+
nPWords = len(tPWords)
|
999
|
+
nPChars = len(tText)
|
1000
|
+
nPWChars = len("".join(tPWords))
|
1001
|
+
|
1002
|
+
paragraphCount += 1
|
1003
|
+
allWords += nPWords
|
1004
|
+
textWords += nPWords
|
1005
|
+
allChars += nPChars
|
1006
|
+
textChars += nPChars
|
1007
|
+
allWordChars += nPWChars
|
1008
|
+
textWordChars += nPWChars
|
869
1009
|
|
870
1010
|
elif tType in self.L_HEADINGS:
|
871
1011
|
titleCount += 1
|
@@ -881,9 +1021,6 @@ class Tokenizer(ABC):
|
|
881
1021
|
allChars += nChars
|
882
1022
|
allWordChars += nWChars
|
883
1023
|
|
884
|
-
elif tType == self.T_TEXT:
|
885
|
-
para.append(tText.rstrip())
|
886
|
-
|
887
1024
|
elif tType == self.T_SYNOPSIS and self._doSynopsis:
|
888
1025
|
text = "{0}: {1}".format(self._localLookup("Synopsis"), tText)
|
889
1026
|
words = text.split()
|
@@ -935,7 +1072,7 @@ class Tokenizer(ABC):
|
|
935
1072
|
def saveRawMarkdown(self, path: str | Path) -> None:
|
936
1073
|
"""Save the raw text to a plain text file."""
|
937
1074
|
with open(path, mode="w", encoding="utf-8") as outFile:
|
938
|
-
for nwdPage in self.
|
1075
|
+
for nwdPage in self._markdown:
|
939
1076
|
outFile.write(nwdPage)
|
940
1077
|
return
|
941
1078
|
|
@@ -950,7 +1087,7 @@ class Tokenizer(ABC):
|
|
950
1087
|
"buildTimeStr": formatTimeStamp(timeStamp),
|
951
1088
|
},
|
952
1089
|
"text": {
|
953
|
-
"nwd": [page.rstrip("\n").split("\n") for page in self.
|
1090
|
+
"nwd": [page.rstrip("\n").split("\n") for page in self._markdown],
|
954
1091
|
}
|
955
1092
|
}
|
956
1093
|
with open(path, mode="w", encoding="utf-8") as fObj:
|
@@ -961,9 +1098,9 @@ class Tokenizer(ABC):
|
|
961
1098
|
# Internal Functions
|
962
1099
|
##
|
963
1100
|
|
964
|
-
def _extractFormats(self, text: str) -> tuple[str,
|
1101
|
+
def _extractFormats(self, text: str, skip: int = 0) -> tuple[str, T_Formats]:
|
965
1102
|
"""Extract format markers from a text paragraph."""
|
966
|
-
temp = []
|
1103
|
+
temp: list[tuple[int, int, int, str]] = []
|
967
1104
|
|
968
1105
|
# Match Markdown
|
969
1106
|
for regEx, fmts in self._rxMarkdown:
|
@@ -971,7 +1108,7 @@ class Tokenizer(ABC):
|
|
971
1108
|
while rxItt.hasNext():
|
972
1109
|
rxMatch = rxItt.next()
|
973
1110
|
temp.extend(
|
974
|
-
|
1111
|
+
(rxMatch.capturedStart(n), rxMatch.capturedLength(n), fmt, "")
|
975
1112
|
for n, fmt in enumerate(fmts) if fmt > 0
|
976
1113
|
)
|
977
1114
|
|
@@ -979,25 +1116,46 @@ class Tokenizer(ABC):
|
|
979
1116
|
rxItt = self._rxShortCodes.globalMatch(text, 0)
|
980
1117
|
while rxItt.hasNext():
|
981
1118
|
rxMatch = rxItt.next()
|
982
|
-
temp.append(
|
1119
|
+
temp.append((
|
983
1120
|
rxMatch.capturedStart(1),
|
984
1121
|
rxMatch.capturedLength(1),
|
985
|
-
self._shortCodeFmt.get(rxMatch.captured(1).lower(), 0)
|
986
|
-
|
1122
|
+
self._shortCodeFmt.get(rxMatch.captured(1).lower(), 0),
|
1123
|
+
"",
|
1124
|
+
))
|
987
1125
|
|
988
|
-
#
|
1126
|
+
# Match Shortcode w/Values
|
1127
|
+
rxItt = self._rxShortCodeVals.globalMatch(text, 0)
|
1128
|
+
tHandle = self._handle or ""
|
1129
|
+
while rxItt.hasNext():
|
1130
|
+
rxMatch = rxItt.next()
|
1131
|
+
kind = self._shortCodeVals.get(rxMatch.captured(1).lower(), 0)
|
1132
|
+
temp.append((
|
1133
|
+
rxMatch.capturedStart(0),
|
1134
|
+
rxMatch.capturedLength(0),
|
1135
|
+
self.FMT_STRIP if kind == skip else kind,
|
1136
|
+
f"{tHandle}:{rxMatch.captured(2)}",
|
1137
|
+
))
|
1138
|
+
|
1139
|
+
# Match Dialogue
|
1140
|
+
if self._rxDialogue:
|
1141
|
+
for regEx, fmtB, fmtE in self._rxDialogue:
|
1142
|
+
rxItt = regEx.globalMatch(text, 0)
|
1143
|
+
while rxItt.hasNext():
|
1144
|
+
rxMatch = rxItt.next()
|
1145
|
+
temp.append((rxMatch.capturedStart(0), 0, fmtB, ""))
|
1146
|
+
temp.append((rxMatch.capturedEnd(0), 0, fmtE, ""))
|
1147
|
+
|
1148
|
+
# Post-process text and format
|
989
1149
|
result = text
|
990
1150
|
formats = []
|
991
|
-
for pos, n, fmt in reversed(sorted(temp, key=lambda x: x[0])):
|
1151
|
+
for pos, n, fmt, key in reversed(sorted(temp, key=lambda x: x[0])):
|
992
1152
|
if fmt > 0:
|
993
1153
|
result = result[:pos] + result[pos+n:]
|
994
|
-
formats = [(p-n, f) for p, f in formats]
|
995
|
-
formats.insert(0, (pos, fmt))
|
1154
|
+
formats = [(p-n, f, k) for p, f, k in formats]
|
1155
|
+
formats.insert(0, (pos, fmt, key))
|
996
1156
|
|
997
1157
|
return result, formats
|
998
1158
|
|
999
|
-
# END Class Tokenizer
|
1000
|
-
|
1001
1159
|
|
1002
1160
|
class HeadingFormatter:
|
1003
1161
|
|
@@ -1067,5 +1225,3 @@ class HeadingFormatter:
|
|
1067
1225
|
hFormat = hFormat.replace(nwHeadFmt.CHAR_FOCUS, fText)
|
1068
1226
|
|
1069
1227
|
return hFormat
|
1070
|
-
|
1071
|
-
# END Class HeadingFormatter
|