novelWriter 2.4.3__py3-none-any.whl → 2.5b1__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- {novelWriter-2.4.3.dist-info → novelWriter-2.5b1.dist-info}/METADATA +4 -5
- {novelWriter-2.4.3.dist-info → novelWriter-2.5b1.dist-info}/RECORD +109 -101
- novelwriter/__init__.py +33 -39
- novelwriter/assets/i18n/project_en_GB.json +1 -0
- novelwriter/assets/icons/typicons_dark/icons.conf +2 -0
- novelwriter/assets/icons/typicons_dark/nw_font.svg +4 -0
- novelwriter/assets/icons/typicons_dark/nw_quote.svg +4 -0
- novelwriter/assets/icons/typicons_light/icons.conf +2 -0
- novelwriter/assets/icons/typicons_light/nw_font.svg +4 -0
- novelwriter/assets/icons/typicons_light/nw_quote.svg +4 -0
- novelwriter/assets/manual.pdf +0 -0
- novelwriter/assets/sample.zip +0 -0
- novelwriter/assets/syntax/cyberpunk_night.conf +5 -3
- novelwriter/assets/syntax/default_dark.conf +32 -18
- novelwriter/assets/syntax/default_light.conf +24 -10
- novelwriter/assets/syntax/dracula.conf +44 -0
- novelwriter/assets/syntax/grey_dark.conf +5 -4
- novelwriter/assets/syntax/grey_light.conf +5 -4
- novelwriter/assets/syntax/light_owl.conf +7 -6
- novelwriter/assets/syntax/night_owl.conf +7 -6
- novelwriter/assets/syntax/snazzy.conf +42 -0
- novelwriter/assets/syntax/solarized_dark.conf +4 -3
- novelwriter/assets/syntax/solarized_light.conf +4 -3
- novelwriter/assets/syntax/tango.conf +27 -11
- novelwriter/assets/syntax/tomorrow.conf +6 -5
- novelwriter/assets/syntax/tomorrow_night.conf +7 -6
- novelwriter/assets/syntax/tomorrow_night_blue.conf +6 -5
- novelwriter/assets/syntax/tomorrow_night_bright.conf +6 -5
- novelwriter/assets/syntax/tomorrow_night_eighties.conf +6 -5
- novelwriter/assets/text/credits_en.htm +4 -1
- novelwriter/assets/themes/cyberpunk_night.conf +2 -0
- novelwriter/assets/themes/default_dark.conf +1 -0
- novelwriter/assets/themes/default_light.conf +1 -0
- novelwriter/assets/themes/dracula.conf +47 -0
- novelwriter/assets/themes/solarized_dark.conf +1 -0
- novelwriter/assets/themes/solarized_light.conf +1 -0
- novelwriter/common.py +31 -9
- novelwriter/config.py +118 -84
- novelwriter/constants.py +40 -26
- novelwriter/core/buildsettings.py +63 -66
- novelwriter/core/coretools.py +2 -22
- novelwriter/core/docbuild.py +51 -40
- novelwriter/core/document.py +3 -5
- novelwriter/core/index.py +115 -45
- novelwriter/core/item.py +8 -19
- novelwriter/core/options.py +2 -4
- novelwriter/core/project.py +23 -57
- novelwriter/core/projectdata.py +1 -3
- novelwriter/core/projectxml.py +12 -15
- novelwriter/core/sessions.py +3 -5
- novelwriter/core/spellcheck.py +4 -9
- novelwriter/core/status.py +211 -164
- novelwriter/core/storage.py +0 -8
- novelwriter/core/tohtml.py +94 -100
- novelwriter/core/tokenizer.py +199 -112
- novelwriter/core/{tomd.py → tomarkdown.py} +97 -78
- novelwriter/core/toodt.py +212 -148
- novelwriter/core/toqdoc.py +403 -0
- novelwriter/core/tree.py +5 -7
- novelwriter/dialogs/about.py +3 -5
- novelwriter/dialogs/docmerge.py +1 -3
- novelwriter/dialogs/docsplit.py +1 -3
- novelwriter/dialogs/editlabel.py +0 -2
- novelwriter/dialogs/preferences.py +111 -88
- novelwriter/dialogs/projectsettings.py +216 -180
- novelwriter/dialogs/quotes.py +3 -4
- novelwriter/dialogs/wordlist.py +3 -9
- novelwriter/enum.py +31 -25
- novelwriter/error.py +8 -15
- novelwriter/extensions/circularprogress.py +5 -6
- novelwriter/extensions/configlayout.py +18 -18
- novelwriter/extensions/eventfilters.py +1 -5
- novelwriter/extensions/modified.py +50 -13
- novelwriter/extensions/novelselector.py +1 -3
- novelwriter/extensions/pagedsidebar.py +9 -12
- novelwriter/extensions/simpleprogress.py +1 -3
- novelwriter/extensions/statusled.py +1 -3
- novelwriter/extensions/switch.py +4 -6
- novelwriter/extensions/switchbox.py +7 -6
- novelwriter/extensions/versioninfo.py +3 -9
- novelwriter/gui/doceditor.py +98 -126
- novelwriter/gui/dochighlight.py +237 -183
- novelwriter/gui/docviewer.py +46 -94
- novelwriter/gui/docviewerpanel.py +3 -10
- novelwriter/gui/editordocument.py +1 -3
- novelwriter/gui/itemdetails.py +7 -11
- novelwriter/gui/mainmenu.py +11 -7
- novelwriter/gui/noveltree.py +11 -24
- novelwriter/gui/outline.py +11 -23
- novelwriter/gui/projtree.py +26 -43
- novelwriter/gui/search.py +1 -3
- novelwriter/gui/sidebar.py +2 -6
- novelwriter/gui/statusbar.py +6 -10
- novelwriter/gui/theme.py +23 -48
- novelwriter/guimain.py +50 -71
- novelwriter/shared.py +30 -15
- novelwriter/tools/dictionaries.py +8 -12
- novelwriter/tools/lipsum.py +2 -4
- novelwriter/tools/manusbuild.py +1 -3
- novelwriter/tools/manuscript.py +66 -145
- novelwriter/tools/manussettings.py +67 -73
- novelwriter/tools/noveldetails.py +6 -11
- novelwriter/tools/welcome.py +2 -16
- novelwriter/tools/writingstats.py +6 -9
- novelwriter/types.py +45 -3
- {novelWriter-2.4.3.dist-info → novelWriter-2.5b1.dist-info}/LICENSE.md +0 -0
- {novelWriter-2.4.3.dist-info → novelWriter-2.5b1.dist-info}/WHEEL +0 -0
- {novelWriter-2.4.3.dist-info → novelWriter-2.5b1.dist-info}/entry_points.txt +0 -0
- {novelWriter-2.4.3.dist-info → novelWriter-2.5b1.dist-info}/top_level.txt +0 -0
novelwriter/core/tokenizer.py
CHANGED
@@ -24,18 +24,19 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 from __future__ import annotations
 
-import re
 import json
 import logging
+import re
 
 from abc import ABC, abstractmethod
-from time import time
-from pathlib import Path
 from functools import partial
+from pathlib import Path
+from time import time
 
 from PyQt5.QtCore import QCoreApplication, QRegularExpression
+from PyQt5.QtGui import QFont
 
-from novelwriter.common import formatTimeStamp, numberToRoman
+from novelwriter.common import checkInt, formatTimeStamp, numberToRoman
 from novelwriter.constants import (
     nwHeadFmt, nwKeyWords, nwLabels, nwRegEx, nwShortcode, nwUnicode, trConst
 )
@@ -48,6 +49,10 @@ logger = logging.getLogger(__name__)
 ESCAPES = {r"\*": "*", r"\~": "~", r"\_": "_", r"\[": "[", r"\]": "]", r"\ ": ""}
 RX_ESC = re.compile("|".join([re.escape(k) for k in ESCAPES.keys()]), flags=re.DOTALL)
 
+T_Formats = list[tuple[int, int, str]]
+T_Comment = tuple[str, T_Formats]
+T_Token = tuple[int, int, str, T_Formats, int]
+
 
 def stripEscape(text: str) -> str:
     """Strip escaped Markdown characters from paragraph text."""
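The three new aliases describe the token stream that the rest of this diff manipulates. A minimal illustration of the shapes they name; the sample values below are not from the package:

```python
T_Formats = list[tuple[int, int, str]]          # (position, format code, data key)
T_Comment = tuple[str, T_Formats]               # (comment text, inline formats)
T_Token = tuple[int, int, str, T_Formats, int]  # (type, header no., text, formats, style)

# A hypothetical text token: format code 1 opens a style at offset 0,
# format code 2 closes it at offset 5, and the final int holds style flags.
token: T_Token = (5, 1, "Hello world", [(0, 1, ""), (5, 2, "")], 0)
```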
@@ -80,6 +85,8 @@ class Tokenizer(ABC):
     FMT_SUP_E = 12  # End superscript
     FMT_SUB_B = 13  # Begin subscript
     FMT_SUB_E = 14  # End subscript
+    FMT_FNOTE = 15  # Footnote marker
+    FMT_STRIP = 16  # Strip the format code
 
     # Block Type
     T_EMPTY = 1  # Empty line (new paragraph)
@@ -111,45 +118,53 @@ class Tokenizer(ABC):
 
     # Lookups
     L_HEADINGS = [T_TITLE, T_HEAD1, T_HEAD2, T_HEAD3, T_HEAD4]
+    L_SKIP_INDENT = [T_TITLE, T_HEAD1, T_HEAD2, T_HEAD2, T_HEAD3, T_HEAD4, T_SEP, T_SKIP]
+    L_SUMMARY = [T_SYNOPSIS, T_SHORT]
 
     def __init__(self, project: NWProject) -> None:
 
         self._project = project
 
         # Data Variables
-        self._text = ""
-        self._handle = None
-        self._result = ""
+        self._text = ""       # The raw text to be tokenized
+        self._handle = None   # The item handle currently being processed
+        self._result = ""     # The result of the last document
+        self._keepMD = False  # Whether to keep the markdown text
 
-
-        self.
+        # Tokens and Meta Data (Per Document)
+        self._tokens: list[T_Token] = []
+        self._footnotes: dict[str, T_Comment] = {}
 
-        #
-        self._tokens: list[tuple[int, int, str, list[tuple[int, int]], int]] = []
+        # Tokens and Meta Data (Per Instance)
         self._counts: dict[str, int] = {}
         self._outline: dict[str, str] = {}
+        self._markdown: list[str] = []
 
         # User Settings
-        self._textFont = "Serif"  # Output text font
-        self._textSize = 11  # Output text size
-        self._textFixed = False  # Fixed width text
+        self._textFont = QFont("Serif", 11)  # Output text font
         self._lineHeight = 1.15   # Line height in units of em
         self._blockIndent = 4.00  # Block indent in units of em
+        self._firstIndent = False  # Enable first line indent
+        self._firstWidth = 1.40    # First line indent in units of em
+        self._indentFirst = False  # Indent first paragraph
         self._doJustify = False   # Justify text
         self._doBodyText = True   # Include body text
         self._doSynopsis = False  # Also process synopsis comments
         self._doComments = False  # Also process comments
         self._doKeywords = False  # Also process keywords like tags and references
         self._skipKeywords = set()  # Keywords to ignore
+        self._keepBreaks = True   # Keep line breaks in paragraphs
 
         # Margins
-        self._marginTitle = (1.
-        self._marginHead1 = (1.
-        self._marginHead2 = (
-        self._marginHead3 = (
-        self._marginHead4 = (
+        self._marginTitle = (1.417, 0.500)
+        self._marginHead1 = (1.417, 0.500)
+        self._marginHead2 = (1.668, 0.500)
+        self._marginHead3 = (1.168, 0.500)
+        self._marginHead4 = (1.168, 0.500)
         self._marginText = (0.000, 0.584)
         self._marginMeta = (0.000, 0.584)
+        self._marginFoot = (1.417, 0.467)
+        self._marginSep = (1.168, 1.168)
 
         # Title Formats
         self._fmtTitle = nwHeadFmt.TITLE  # Formatting for titles
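The `__init__` changes fold the old font family/size/fixed trio into a single `QFont` value. A sketch of the equivalent object; deriving the old fixed-pitch flag from the font is an assumption, not something shown in the diff:

```python
from PyQt5.QtGui import QFont

font = QFont("Serif", 11)   # family and point size in one object
fixed = font.fixedPitch()   # assumption: pitch is now queried from the font itself
```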
@@ -205,6 +220,9 @@ class Tokenizer(ABC):
             nwShortcode.SUP_O: self.FMT_SUP_B, nwShortcode.SUP_C: self.FMT_SUP_E,
             nwShortcode.SUB_O: self.FMT_SUB_B, nwShortcode.SUB_C: self.FMT_SUB_E,
         }
+        self._shortCodeVals = {
+            nwShortcode.FOOTNOTE_B: self.FMT_FNOTE,
+        }
 
         return
 
@@ -220,7 +238,7 @@ class Tokenizer(ABC):
     @property
     def allMarkdown(self) -> list[str]:
         """The combined novelWriter Markdown text."""
-        return self.
+        return self._markdown
 
     @property
     def textStats(self) -> dict[str, int]:
@@ -298,11 +316,9 @@ class Tokenizer(ABC):
         )
         return
 
-    def setFont(self,
+    def setFont(self, font: QFont) -> None:
         """Set the build font."""
-        self._textFont =
-        self._textSize = round(int(size))
-        self._textFixed = isFixed
+        self._textFont = font
         return
 
     def setLineHeight(self, height: float) -> None:
@@ -315,6 +331,15 @@ class Tokenizer(ABC):
         self._blockIndent = min(max(float(indent), 0.0), 10.0)
         return
 
+    def setFirstLineIndent(self, state: bool, indent: float, first: bool) -> None:
+        """Set first line indent and whether to also indent first
+        paragraph after a heading.
+        """
+        self._firstIndent = state
+        self._firstWidth = indent
+        self._indentFirst = first
+        return
+
     def setJustify(self, state: bool) -> None:
         """Enable or disable text justification."""
         self._doJustify = state
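How the three values set by `setFirstLineIndent` could interact is sketched below as a standalone function; the helper and its name are hypothetical, only the three settings come from the diff:

```python
def firstLineIndentEm(state: bool, width: float, first: bool,
                      isFirstPar: bool) -> float:
    """Return the first line indent in em, 0.0 when it does not apply."""
    if not state:
        return 0.0  # feature disabled
    if isFirstPar and not first:
        return 0.0  # first paragraph after a heading stays flat
    return width

assert firstLineIndentEm(True, 1.40, False, isFirstPar=True) == 0.0
assert firstLineIndentEm(True, 1.40, False, isFirstPar=False) == 1.40
```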
@@ -355,6 +380,11 @@ class Tokenizer(ABC):
         self._marginMeta = (float(upper), float(lower))
         return
 
+    def setSeparatorMargins(self, upper: float, lower: float) -> None:
+        """Set the upper and lower meta text margin."""
+        self._marginSep = (float(upper), float(lower))
+        return
+
     def setLinkHeadings(self, state: bool) -> None:
         """Enable or disable adding an anchor before headings."""
         self._linkHeadings = state
@@ -385,9 +415,14 @@ class Tokenizer(ABC):
         self._skipKeywords = set(x.lower().strip() for x in keywords.split(","))
         return
 
+    def setKeepLineBreaks(self, state: bool) -> None:
+        """Keep line breaks in paragraphs."""
+        self._keepBreaks = state
+        return
+
     def setKeepMarkdown(self, state: bool) -> None:
         """Keep original markdown during build."""
-        self.
+        self._keepMD = state
         return
 
     ##
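What `_keepBreaks` controls becomes visible in the second pass further down: it selects the separator used when a paragraph's lines are joined back together. A minimal sketch:

```python
keepBreaks = False
lineSep = "\n" if keepBreaks else " "
paragraph = lineSep.join(["He knocked.", "Nobody answered."])
assert paragraph == "He knocked. Nobody answered."
```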
@@ -417,8 +452,8 @@ class Tokenizer(ABC):
         self._tokens.append((
             self.T_TITLE, 1, title, [], textAlign
         ))
-        if self.
-            self.
+        if self._keepMD:
+            self._markdown.append(f"#! {title}\n\n")
 
         return
 
@@ -466,22 +501,23 @@ class Tokenizer(ABC):
         4: The internal formatting map of the text, self.FMT_*
         5: The style of the block, self.A_*
         """
-        self._tokens = []
         if self._isNovel:
             self._hFormatter.setHandle(self._handle)
 
         nHead = 0
         breakNext = False
         tmpMarkdown = []
+        tHandle = self._handle or ""
+        tokens: list[T_Token] = []
         for aLine in self._text.splitlines():
             sLine = aLine.strip().lower()
 
             # Check for blank lines
             if len(sLine) == 0:
-
+                tokens.append((
                     self.T_EMPTY, nHead, "", [], self.A_NONE
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append("\n")
 
                 continue
@@ -507,7 +543,7 @@ class Tokenizer(ABC):
                 continue
 
             elif sLine == "[vspace]":
-
+                tokens.append(
                     (self.T_SKIP, nHead, "", [], sAlign)
                 )
                 continue
@@ -515,11 +551,11 @@ class Tokenizer(ABC):
             elif sLine.startswith("[vspace:") and sLine.endswith("]"):
                 nSkip = checkInt(sLine[8:-1], 0)
                 if nSkip >= 1:
-
+                    tokens.append(
                         (self.T_SKIP, nHead, "", [], sAlign)
                     )
                 if nSkip > 1:
-
+                    tokens += (nSkip - 1) * [
                         (self.T_SKIP, nHead, "", [], self.A_NONE)
                     ]
                 continue
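The `[vspace:n]` handler above emits one skip token that keeps the current alignment and n-1 plain ones. A standalone sketch of that expansion; the token constants are illustrative, not the real values:

```python
T_SKIP, A_NONE = 8, 0  # illustrative constants

def vspaceTokens(nSkip: int, nHead: int, sAlign: int) -> list[tuple]:
    tokens: list[tuple] = []
    if nSkip >= 1:
        tokens.append((T_SKIP, nHead, "", [], sAlign))
    if nSkip > 1:
        tokens += (nSkip - 1) * [(T_SKIP, nHead, "", [], A_NONE)]
    return tokens

assert len(vspaceTokens(3, 1, 0)) == 3
```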
@@ -533,24 +569,32 @@ class Tokenizer(ABC):
             if aLine.startswith("%~"):
                 continue
 
-            cStyle, cText, _ = processComment(aLine)
+            cStyle, cKey, cText, _, _ = processComment(aLine)
             if cStyle == nwComment.SYNOPSIS:
-                self.
-
+                tLine, tFmt = self._extractFormats(cText)
+                tokens.append((
+                    self.T_SYNOPSIS, nHead, tLine, tFmt, sAlign
                 ))
-                if self._doSynopsis and self.
+                if self._doSynopsis and self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")
             elif cStyle == nwComment.SHORT:
-                self.
-
+                tLine, tFmt = self._extractFormats(cText)
+                tokens.append((
+                    self.T_SHORT, nHead, tLine, tFmt, sAlign
                 ))
-                if self._doSynopsis and self.
+                if self._doSynopsis and self._keepMD:
+                    tmpMarkdown.append(f"{aLine}\n")
+            elif cStyle == nwComment.FOOTNOTE:
+                tLine, tFmt = self._extractFormats(cText, skip=self.FMT_FNOTE)
+                self._footnotes[f"{tHandle}:{cKey}"] = (tLine, tFmt)
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")
             else:
-                self.
-
+                tLine, tFmt = self._extractFormats(cText)
+                tokens.append((
+                    self.T_COMMENT, nHead, tLine, tFmt, sAlign
                 ))
-                if self._doComments and self.
+                if self._doComments and self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")
 
         elif aLine.startswith("@"):
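Footnote definitions are not tokenized in place; they are parked in `_footnotes` under a handle-qualified key so the marker left in the text can find them later. A sketch of that bookkeeping, with made-up handle and key values:

```python
footnotes: dict[str, tuple[str, list]] = {}

tHandle = "4c4f1b7ea8e7d"             # hypothetical document handle
cKey, cText = "fn1", "A short note."  # as parsed from a footnote comment
footnotes[f"{tHandle}:{cKey}"] = (cText, [])

assert footnotes["4c4f1b7ea8e7d:fn1"] == ("A short note.", [])
```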
@@ -560,11 +604,14 @@ class Tokenizer(ABC):
             # are automatically skipped.
 
             valid, bits, _ = self._project.index.scanThis(aLine)
-            if
-
+            if (
+                valid and bits and bits[0] in nwLabels.KEY_NAME
+                and bits[0] not in self._skipKeywords
+            ):
+                tokens.append((
                     self.T_KEYWORD, nHead, aLine[1:].strip(), [], sAlign
                 ))
-                if self._doKeywords and self.
+                if self._doKeywords and self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")
 
         elif aLine.startswith(("# ", "#! ")):
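The rewritten keyword condition is easier to read pulled out on its own. A sketch with an illustrative subset of key names; the real set comes from `nwLabels.KEY_NAME`:

```python
KEY_NAME = {"@char", "@pov"}  # illustrative subset of recognised keys
skipKeywords = {"@pov"}

def keepKeyword(valid: bool, bits: list[str]) -> bool:
    return bool(valid and bits and bits[0] in KEY_NAME
                and bits[0] not in skipKeywords)

assert keepKeyword(True, ["@char", "Jane"]) is True
assert keepKeyword(True, ["@pov", "Jane"]) is False
```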
@@ -597,10 +644,10 @@ class Tokenizer(ABC):
                     self._hFormatter.resetAll()
                     self._noSep = True
 
-
+                tokens.append((
                     tType, nHead, tText, [], tStyle
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")
 
             elif aLine.startswith(("## ", "##! ")):
@@ -632,10 +679,10 @@ class Tokenizer(ABC):
                     self._hFormatter.resetScene()
                     self._noSep = True
 
-
+                tokens.append((
                     tType, nHead, tText, [], tStyle
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")
 
             elif aLine.startswith(("### ", "###! ")):
@@ -673,10 +720,10 @@ class Tokenizer(ABC):
                     tStyle = self.A_NONE if self._noSep else self.A_CENTRE
                     self._noSep = False
 
-
+                tokens.append((
                     tType, nHead, tText, [], tStyle
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")
 
             elif aLine.startswith("#### "):
@@ -703,10 +750,10 @@ class Tokenizer(ABC):
                     tType = self.T_SEP
                     tStyle = self.A_CENTRE
 
-
+                tokens.append((
                     tType, nHead, tText, [], tStyle
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")
 
             else:
@@ -750,54 +797,91 @@ class Tokenizer(ABC):
                     sAlign |= self.A_IND_R
 
                 # Process formats
-                tLine,
-
-                    self.T_TEXT, nHead, tLine,
+                tLine, tFmt = self._extractFormats(aLine)
+                tokens.append((
+                    self.T_TEXT, nHead, tLine, tFmt, sAlign
                 ))
-                if self.
+                if self._keepMD:
                     tmpMarkdown.append(f"{aLine}\n")
 
         # If we have content, turn off the first page flag
-        if self._isFirst and
+        if self._isFirst and tokens:
             self._isFirst = False  # First document has been processed
 
             # Make sure the token array doesn't start with a page break
             # on the very first page, adding a blank first page.
-            if
-
-
-
+            if tokens[0][4] & self.A_PBB:
+                cToken = tokens[0]
+                tokens[0] = (
+                    cToken[0], cToken[1], cToken[2], cToken[3], cToken[4] & ~self.A_PBB
                 )
 
         # Always add an empty line at the end of the file
-
+        tokens.append((
            self.T_EMPTY, nHead, "", [], self.A_NONE
         ))
-        if self.
+        if self._keepMD:
             tmpMarkdown.append("\n")
-            self.
+            self._markdown.append("".join(tmpMarkdown))
 
         # Second Pass
         # ===========
-        #
+        # This second pass strips away consecutive blank lines, and
+        # combines consecutive text lines into the same paragraph.
+        # It also ensures that there isn't paragraph spacing between
+        # meta data lines for formats that has spacing.
+
+        self._tokens = []
+        pToken: T_Token = (self.T_EMPTY, 0, "", [], self.A_NONE)
+        nToken: T_Token = (self.T_EMPTY, 0, "", [], self.A_NONE)
+
+        lineSep = "\n" if self._keepBreaks else " "
+        pLines: list[T_Token] = []
 
-
-
-        tCount = len(self._tokens)
-        for n, token in enumerate(self._tokens):
+        tCount = len(tokens)
+        for n, cToken in enumerate(tokens):
 
             if n > 0:
-                pToken =
+                pToken = tokens[n-1]  # Look behind
             if n < tCount - 1:
-                nToken =
+                nToken = tokens[n+1]  # Look ahead
 
-            if
-
+            if cToken[0] == self.T_EMPTY:
+                # We don't need to keep the empty lines after this pass
+                pass
+
+            elif cToken[0] == self.T_KEYWORD:
+                # Adjust margins for lines in a list of keyword lines
+                aStyle = cToken[4]
                 if pToken[0] == self.T_KEYWORD:
                     aStyle |= self.A_Z_TOPMRG
                 if nToken[0] == self.T_KEYWORD:
                     aStyle |= self.A_Z_BTMMRG
-                self._tokens
+                self._tokens.append((
+                    cToken[0], cToken[1], cToken[2], cToken[3], aStyle
+                ))
+
+            elif cToken[0] == self.T_TEXT:
+                # Combine lines from the same paragraph
+                pLines.append(cToken)
+                if nToken[0] != self.T_TEXT:
+                    nLines = len(pLines)
+                    if nLines == 1:
+                        self._tokens.append(pLines[0])
+                    elif nLines > 1:
+                        tTxt = ""
+                        tFmt: T_Formats = []
+                        for aToken in pLines:
+                            tLen = len(tTxt)
+                            tTxt += f"{aToken[2]}{lineSep}"
+                            tFmt.extend((p+tLen, fmt, key) for p, fmt, key in aToken[3])
+                        self._tokens.append((
+                            self.T_TEXT, pLines[0][1], tTxt[:-1], tFmt, pLines[0][4]
+                        ))
+                    pLines = []
+
+            else:
+                self._tokens.append(cToken)
 
         return
 
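The core of the new second pass is the offset arithmetic when text lines merge into one paragraph: each line's format entries shift by the length of the text already accumulated. A standalone sketch with arbitrary format codes:

```python
lineSep = " "  # what _keepBreaks=False selects
pLines = [("Bold", [(0, 1, ""), (4, 2, "")]), ("plain", [])]

tTxt = ""
tFmt: list[tuple[int, int, str]] = []
for text, fmts in pLines:
    tLen = len(tTxt)
    tTxt += f"{text}{lineSep}"
    tFmt.extend((p + tLen, fmt, key) for p, fmt, key in fmts)

assert tTxt[:-1] == "Bold plain"  # trailing separator dropped
assert tFmt == [(0, 1, ""), (4, 2, "")]
```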
@@ -840,7 +924,6 @@ class Tokenizer(ABC):
         textWordChars = self._counts.get("textWordChars", 0)
         titleWordChars = self._counts.get("titleWordChars", 0)
 
-        para = []
         for tType, _, tText, _, _ in self._tokens:
             tText = tText.replace(nwUnicode.U_ENDASH, " ")
             tText = tText.replace(nwUnicode.U_EMDASH, " ")
@@ -850,22 +933,19 @@ class Tokenizer(ABC):
             nChars = len(tText)
             nWChars = len("".join(tWords))
 
-            if tType == self.
-
-
-
-
-
-
-
-
-
-
-
-                allWordChars += nPWChars
-                textWordChars += nPWChars
-                para = []
+            if tType == self.T_TEXT:
+                tPWords = tText.split()
+                nPWords = len(tPWords)
+                nPChars = len(tText)
+                nPWChars = len("".join(tPWords))
+
+                paragraphCount += 1
+                allWords += nPWords
+                textWords += nPWords
+                allChars += nPChars
+                textChars += nPChars
+                allWordChars += nPWChars
+                textWordChars += nPWChars
 
             elif tType in self.L_HEADINGS:
                 titleCount += 1
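With paragraphs already merged in the second pass, the stats loop now counts each text token directly instead of buffering lines in `para`. The counting itself reduces to:

```python
text = "He knocked. Nobody answered."  # one merged paragraph token
words = text.split()

nPWords = len(words)            # word count
nPChars = len(text)             # characters including spaces
nPWChars = len("".join(words))  # characters excluding spaces

assert (nPWords, nPChars, nPWChars) == (4, 28, 25)
```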
@@ -881,9 +961,6 @@ class Tokenizer(ABC):
                 allChars += nChars
                 allWordChars += nWChars
 
-            elif tType == self.T_TEXT:
-                para.append(tText.rstrip())
-
             elif tType == self.T_SYNOPSIS and self._doSynopsis:
                 text = "{0}: {1}".format(self._localLookup("Synopsis"), tText)
                 words = text.split()
@@ -935,7 +1012,7 @@ class Tokenizer(ABC):
     def saveRawMarkdown(self, path: str | Path) -> None:
         """Save the raw text to a plain text file."""
         with open(path, mode="w", encoding="utf-8") as outFile:
-            for nwdPage in self.
+            for nwdPage in self._markdown:
                 outFile.write(nwdPage)
         return
 
@@ -950,7 +1027,7 @@ class Tokenizer(ABC):
                 "buildTimeStr": formatTimeStamp(timeStamp),
             },
             "text": {
-                "nwd": [page.rstrip("\n").split("\n") for page in self.
+                "nwd": [page.rstrip("\n").split("\n") for page in self._markdown],
            }
         }
         with open(path, mode="w", encoding="utf-8") as fObj:
@@ -961,9 +1038,9 @@ class Tokenizer(ABC):
     # Internal Functions
     ##
 
-    def _extractFormats(self, text: str) -> tuple[str,
+    def _extractFormats(self, text: str, skip: int = 0) -> tuple[str, T_Formats]:
         """Extract format markers from a text paragraph."""
-        temp = []
+        temp: list[tuple[int, int, int, str]] = []
 
         # Match Markdown
         for regEx, fmts in self._rxMarkdown:
@@ -971,7 +1048,7 @@ class Tokenizer(ABC):
             while rxItt.hasNext():
                 rxMatch = rxItt.next()
                 temp.extend(
-
+                    (rxMatch.capturedStart(n), rxMatch.capturedLength(n), fmt, "")
                     for n, fmt in enumerate(fmts) if fmt > 0
                 )
@@ -979,25 +1056,37 @@ class Tokenizer(ABC):
         rxItt = self._rxShortCodes.globalMatch(text, 0)
         while rxItt.hasNext():
             rxMatch = rxItt.next()
-            temp.append(
+            temp.append((
                 rxMatch.capturedStart(1),
                 rxMatch.capturedLength(1),
-                self._shortCodeFmt.get(rxMatch.captured(1).lower(), 0)
-
+                self._shortCodeFmt.get(rxMatch.captured(1).lower(), 0),
+                "",
+            ))
 
-        #
+        # Match Shortcode w/Values
+        rxItt = self._rxShortCodeVals.globalMatch(text, 0)
+        tHandle = self._handle or ""
+        while rxItt.hasNext():
+            rxMatch = rxItt.next()
+            kind = self._shortCodeVals.get(rxMatch.captured(1).lower(), 0)
+            temp.append((
+                rxMatch.capturedStart(0),
+                rxMatch.capturedLength(0),
+                self.FMT_STRIP if kind == skip else kind,
+                f"{tHandle}:{rxMatch.captured(2)}",
+            ))
+
+        # Post-process text and format
         result = text
         formats = []
-        for pos, n, fmt in reversed(sorted(temp, key=lambda x: x[0])):
+        for pos, n, fmt, key in reversed(sorted(temp, key=lambda x: x[0])):
             if fmt > 0:
                 result = result[:pos] + result[pos+n:]
-                formats = [(p-n, f) for p, f in formats]
-                formats.insert(0, (pos, fmt))
+                formats = [(p-n, f, k) for p, f, k in formats]
+                formats.insert(0, (pos, fmt, key))
 
         return result, formats
 
-
-# END Class Tokenizer
 
 class HeadingFormatter:
 
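The marker-stripping loop above walks matches from the end of the string backwards, so a removal never invalidates positions still to be processed; already-recorded formats are shifted left instead. A standalone sketch with arbitrary format codes:

```python
temp = [(0, 2, 1, ""), (6, 2, 2, "")]  # (position, length, format, key)

result = "**bold** x"
formats: list[tuple[int, int, str]] = []
for pos, n, fmt, key in reversed(sorted(temp, key=lambda x: x[0])):
    if fmt > 0:
        result = result[:pos] + result[pos+n:]
        formats = [(p - n, f, k) for p, f, k in formats]
        formats.insert(0, (pos, fmt, key))

assert result == "bold x"
assert formats == [(0, 1, ""), (4, 2, "")]
```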
@@ -1067,5 +1156,3 @@ class HeadingFormatter:
         hFormat = hFormat.replace(nwHeadFmt.CHAR_FOCUS, fText)
 
         return hFormat
-
-# END Class HeadingFormatter