chgksuite 0.26.0b11__py3-none-any.whl → 0.27.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chgksuite/_html2md.py +90 -0
- chgksuite/cli.py +38 -8
- chgksuite/common.py +16 -12
- chgksuite/composer/__init__.py +9 -7
- chgksuite/composer/chgksuite_parser.py +20 -9
- chgksuite/composer/composer_common.py +30 -3
- chgksuite/composer/db.py +1 -2
- chgksuite/composer/docx.py +542 -292
- chgksuite/composer/latex.py +3 -4
- chgksuite/composer/lj.py +1 -2
- chgksuite/composer/{reddit.py → markdown.py} +35 -25
- chgksuite/composer/openquiz.py +2 -3
- chgksuite/composer/pptx.py +18 -6
- chgksuite/composer/telegram.py +22 -10
- chgksuite/handouter/gen.py +11 -7
- chgksuite/handouter/installer.py +0 -0
- chgksuite/handouter/runner.py +237 -10
- chgksuite/handouter/tex_internals.py +12 -13
- chgksuite/handouter/utils.py +22 -1
- chgksuite/lastdir +1 -0
- chgksuite/parser.py +218 -37
- chgksuite/parser_db.py +4 -6
- chgksuite/resources/labels_az.toml +22 -0
- chgksuite/resources/labels_by.toml +1 -2
- chgksuite/resources/labels_by_tar.toml +1 -2
- chgksuite/resources/labels_en.toml +1 -2
- chgksuite/resources/labels_kz_cyr.toml +1 -2
- chgksuite/resources/labels_ru.toml +1 -2
- chgksuite/resources/labels_sr.toml +1 -2
- chgksuite/resources/labels_ua.toml +1 -2
- chgksuite/resources/labels_uz.toml +0 -3
- chgksuite/resources/labels_uz_cyr.toml +1 -2
- chgksuite/resources/regexes_az.json +17 -0
- chgksuite/resources/regexes_by.json +3 -2
- chgksuite/resources/regexes_by_tar.json +17 -0
- chgksuite/resources/regexes_en.json +3 -2
- chgksuite/resources/regexes_kz_cyr.json +3 -2
- chgksuite/resources/regexes_ru.json +3 -2
- chgksuite/resources/regexes_sr.json +3 -2
- chgksuite/resources/regexes_ua.json +3 -2
- chgksuite/resources/regexes_uz.json +16 -0
- chgksuite/resources/regexes_uz_cyr.json +3 -2
- chgksuite/trello.py +8 -9
- chgksuite/typotools.py +9 -8
- chgksuite/version.py +1 -1
- {chgksuite-0.26.0b11.dist-info → chgksuite-0.27.0.dist-info}/METADATA +10 -19
- chgksuite-0.27.0.dist-info/RECORD +63 -0
- {chgksuite-0.26.0b11.dist-info → chgksuite-0.27.0.dist-info}/WHEEL +1 -2
- chgksuite/composer/telegram_parser.py +0 -230
- chgksuite-0.26.0b11.dist-info/RECORD +0 -59
- chgksuite-0.26.0b11.dist-info/top_level.txt +0 -1
- {chgksuite-0.26.0b11.dist-info → chgksuite-0.27.0.dist-info}/entry_points.txt +0 -0
- {chgksuite-0.26.0b11.dist-info → chgksuite-0.27.0.dist-info}/licenses/LICENSE +0 -0
chgksuite/parser.py
CHANGED
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import base64
-import codecs
 import datetime
+import hashlib
 import itertools
 import json
 import os
@@ -13,12 +13,15 @@ import subprocess
 import sys
 import tempfile
 import urllib
+import time

 import bs4
 import chardet
-import dashtable
 import mammoth
+
+from chgksuite._html2md import html2md
 import pypandoc
+import requests
 import toml
 from bs4 import BeautifulSoup
 from parse import parse
@@ -26,11 +29,11 @@ from parse import parse
 import chgksuite.typotools as typotools
 from chgksuite.common import (
     QUESTION_LABELS,
-    DefaultArgs,
     DefaultNamespace,
     DummyLogger,
     check_question,
     compose_4s,
+    get_chgksuite_dir,
     get_lastdir,
     init_logger,
     load_settings,
@@ -40,9 +43,10 @@ from chgksuite.common import (
 from chgksuite.composer import gui_compose
 from chgksuite.composer.composer_common import make_filename
 from chgksuite.parser_db import chgk_parse_db
+from chgksuite.typotools import re_url
 from chgksuite.typotools import remove_excessive_whitespace as rew

-
+
 SEP = os.linesep
 EDITORS = {
     "win32": "notepad",
@@ -57,7 +61,7 @@ def partition(alist, indices):


 def load_regexes(regexfile):
-    with
+    with open(regexfile, "r", encoding="utf-8") as f:
         regexes = json.loads(f.read())
     return {k: re.compile(v) for k, v in regexes.items()}

@@ -107,7 +111,7 @@ class ChgkParser:

     def __init__(self, defaultauthor=None, args=None, logger=None):
         self.defaultauthor = defaultauthor
-        args = args or
+        args = args or DefaultNamespace()
         self.regexes = load_regexes(args.regexes)
         self.logger = logger or init_logger("parser")
         self.args = args
@@ -121,6 +125,148 @@ class ChgkParser:
         if self.args.language == "en":
             self.args.typography_quotes = "off"

+    def _setup_image_cache(self):
+        """Setup image download cache directory and load existing cache"""
+        if not hasattr(self, "_image_cache"):
+            self.image_cache_dir = os.path.join(
+                get_chgksuite_dir(), "downloaded_images"
+            )
+            os.makedirs(self.image_cache_dir, exist_ok=True)
+
+            self.image_cache_file = os.path.join(
+                get_chgksuite_dir(), "image_download_cache.json"
+            )
+            if os.path.isfile(self.image_cache_file):
+                try:
+                    with open(self.image_cache_file, encoding="utf8") as f:
+                        self._image_cache = json.load(f)
+                except (json.JSONDecodeError, OSError):
+                    self._image_cache = {}
+            else:
+                self._image_cache = {}
+
+    def _download_image(self, url):
+        """Download image from URL and return local filename"""
+        self._setup_image_cache()
+        url = url.replace("\\", "")
+
+        # Check cache first
+        url_hash = hashlib.sha256(url.encode("utf-8")).hexdigest()[:20]
+        if url_hash in self._image_cache:
+            cached_filename = self._image_cache[url_hash]
+            cached_path = os.path.join(self.image_cache_dir, cached_filename)
+            if os.path.isfile(cached_path):
+                return cached_path
+
+        # Determine file extension
+        parsed_url = urllib.parse.urlparse(url)
+        path_lower = parsed_url.path.lower()
+        ext = None
+        for image_ext in [".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".svg"]:
+            if path_lower.endswith(image_ext):
+                ext = image_ext
+                break
+
+        if not ext:
+            # Try to guess from URL structure
+            if any(
+                img_ext in path_lower
+                for img_ext in [
+                    ".jpg",
+                    ".jpeg",
+                    ".png",
+                    ".webp",
+                    ".gif",
+                    ".bmp",
+                    ".svg",
+                ]
+            ):
+                for image_ext in [
+                    ".jpg",
+                    ".jpeg",
+                    ".png",
+                    ".webp",
+                    ".gif",
+                    ".bmp",
+                    ".svg",
+                ]:
+                    if image_ext in path_lower:
+                        ext = image_ext
+                        break
+            else:
+                ext = ".jpg"  # Default extension
+
+        filename = url_hash + ext
+        filepath = os.path.join(self.image_cache_dir, filename)
+
+        try:
+            self.logger.info(f"Downloading image from {url}")
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+                "Accept": "image/png,image/jpeg,image/webp,image/gif,image/*,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9",
+                "Accept-Encoding": "gzip, deflate, br",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1",
+            }
+            response = requests.get(url, timeout=30, stream=True, headers=headers)
+            response.raise_for_status()
+            time.sleep(0.5)  # rate limiting
+
+            with open(filepath, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+            # Update cache
+            self._image_cache[url_hash] = filename
+            with open(self.image_cache_file, "w", encoding="utf8") as f:
+                json.dump(self._image_cache, f, indent=2, sort_keys=True)
+
+            return filepath
+
+        except Exception as e:
+            self.logger.warning(f"Failed to download image from {url}: {e}")
+            return None
+
+    def _process_images_in_text(self, text):
+        """Process text to find image URLs and replace them with local references"""
+        if not text or not getattr(self.args, "download_images", False):
+            return text
+
+        if isinstance(text, list):
+            return [self._process_images_in_text(item) for item in text]
+
+        if not isinstance(text, str):
+            return text
+
+        # Find all URLs in the text
+        for match in re_url.finditer(text):
+            url = match.group(0)
+            url_lower = url.lower()
+
+            # Check if it's a direct image URL
+            if any(
+                url_lower.endswith(ext)
+                for ext in [".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".svg"]
+            ):
+                local_filename = self._download_image(url)
+                if local_filename:
+                    # Replace URL with chgksuite image syntax
+                    img_reference = f"(img {local_filename})"
+                    text = text.replace(url, img_reference)
+
+        return text
+
+    def _process_question_images(self, question):
+        """Process a question dict to download images from URLs"""
+        if not getattr(self.args, "download_images", False):
+            return
+
+        # Process all fields except 'source'
+        for field in question:
+            if field != "source":
+                question[field] = self._process_images_in_text(question[field])
+
     def merge_to_previous(self, index):
         target = index - 1
         if self.structure[target][1]:
@@ -181,7 +327,7 @@ class ChgkParser:
                 regex,
                 regexes[regex].search(self.remove_formatting(st[i][1])).start(0),
             )
-            for regex in set(regexes) - {"number", "date2"}
+            for regex in set(regexes) - {"number", "date2", "handout_short"}
             if regexes[regex].search(self.remove_formatting(st[i][1]))
         }

@@ -380,7 +526,7 @@ class ChgkParser:
         )

         if debug:
-            with
+            with open("debug_0.txt", "w", encoding="utf-8") as f:
                 f.write(text)

         # 1.
@@ -412,7 +558,7 @@ class ChgkParser:
         i = 0

         if debug:
-            with
+            with open("debug_1.json", "w", encoding="utf-8") as f:
                 f.write(json.dumps(self.structure, ensure_ascii=False, indent=4))

         self.process_single_number_lines()
@@ -420,15 +566,15 @@ class ChgkParser:
         # hack for https://gitlab.com/peczony/chgksuite/-/issues/23; TODO: make less hacky
         for i, element in enumerate(self.structure):
             if (
-                "
-                or "
+                "Дуплет." in element[1].split()
+                or "Блиц." in element[1].split()
                 and element[0] != "question"
                 and (i == 0 or self.structure[i - 1][0] != "question")
             ):
                 element[0] = "question"

         if debug:
-            with
+            with open("debug_1a.json", "w", encoding="utf-8") as f:
                 f.write(json.dumps(self.structure, ensure_ascii=False, indent=4))

         # 2.
@@ -438,7 +584,7 @@ class ChgkParser:
         self.merge_to_x_until_nextfield("comment")

         if debug:
-            with
+            with open("debug_2.json", "w", encoding="utf-8") as f:
                 f.write(json.dumps(self.structure, ensure_ascii=False, indent=4))

         # 3.
@@ -501,7 +647,7 @@ class ChgkParser:
         self.merge_to_x_until_nextfield("nezachet")

         if debug:
-            with
+            with open("debug_3.json", "w", encoding="utf-8") as f:
                 f.write(json.dumps(self.structure, ensure_ascii=False, indent=4))

         # 4.
@@ -513,7 +659,19 @@ class ChgkParser:
         ):
             self.merge_to_next(0)

-
+        if debug:
+            with open("debug_3a.json", "w", encoding="utf-8") as f:
+                f.write(
+                    json.dumps(
+                        list(enumerate(self.structure)), ensure_ascii=False, indent=4
+                    )
+                )
+
+        idx = 0
+        cycle = -1
+        while idx < len(self.structure):
+            cycle += 1
+            element = self.structure[idx]
             if element[0] == "":
                 element[0] = "meta"
             if element[0] in regexes and element[0] not in [
@@ -525,34 +683,42 @@ class ChgkParser:
                 try:
                     num = regexes["question"].search(element[1])
                     if num and num.group("number"):
-                        self.structure.insert(
+                        self.structure.insert(idx, ["number", num.group("number")])
+                        idx += 1
                 except Exception as e:
+                    num = None
                     sys.stderr.write(
-                        f"exception at
+                        f"exception at setting number: {type(e)} {e}\nQuestion: {element[1]}\n"
                     )
-                if (
-
-                    and ("нулевой вопрос" in element[1].lower())
+                if (num is None or num and not num.group("number")) and (
+                    ("нулевой вопрос" in element[1].lower())
                     or ("разминочный вопрос" in element[1].lower())
                 ):
-                    self.structure.insert(
+                    self.structure.insert(idx, ["number", "0"])
+                    idx += 1
                 if element[0] == "question":
                     lines = element[1].split(SEP)
                     for i, line in enumerate(lines):
                         if regexes["question"].search(line):
                             lines[i] = regexes["question"].sub("", line, 1)
                     element[1] = SEP.join([x.strip() for x in lines if x.strip()])
+                    before_replacement = None
                 else:
                     before_replacement = element[1]
                     element[1] = regexes[element[0]].sub("", element[1], 1)
                 if element[1].startswith(SEP):
                     element[1] = element[1][len(SEP) :]
                 # TODO: переделать корявую обработку авторки на нормальную
-                if
+                if (
+                    element[0] == "author"
+                    and before_replacement
+                    and "авторка:" in before_replacement.lower()
+                ):
                     element[1] = "!!Авторка" + element[1]
+            idx += 1

         if debug:
-            with
+            with open("debug_4.json", "w", encoding="utf-8") as f:
                 f.write(json.dumps(self.structure, ensure_ascii=False, indent=4))

         # 5.
@@ -632,7 +798,7 @@ class ChgkParser:
         )

         if debug:
-            with
+            with open("debug_5.json", "w", encoding="utf-8") as f:
                 f.write(json.dumps(self.structure, ensure_ascii=False, indent=4))

         # 6.
@@ -648,6 +814,7 @@ class ChgkParser:
             ):
                 if self.defaultauthor and "author" not in current_question:
                     current_question["author"] = self.defaultauthor
+                self._process_question_images(current_question)
                 check_question(current_question, logger=logger)
                 final_structure.append(["Question", current_question])
                 current_question = {}
@@ -681,11 +848,12 @@ class ChgkParser:
         if current_question != {}:
             if self.defaultauthor and "author" not in current_question:
                 current_question["author"] = self.defaultauthor
+            self._process_question_images(current_question)
             check_question(current_question, logger=logger)
             final_structure.append(["Question", current_question])

         if debug:
-            with
+            with open("debug_6.json", "w", encoding="utf-8") as f:
                 f.write(json.dumps(final_structure, ensure_ascii=False, indent=4))

         # 7.
@@ -724,16 +892,22 @@ class ChgkParser:
            elif element[0] == "tour" and self.args.tour_numbers_as_words == "on":
                element[1] = f"{self.TOUR_NUMBERS_AS_WORDS[tour_cnt]} тур"
                tour_cnt += 1
+           elif element[0] not in ["Question", "source"] and getattr(
+               self.args, "download_images", False
+           ):
+               # Process images in metadata fields (excluding source)
+               element[1] = self._process_images_in_text(element[1])

         if debug:
-            with
+            with open("debug_final.json", "w", encoding="utf-8") as f:
                 f.write(json.dumps(final_structure, ensure_ascii=False, indent=4))
         return final_structure


 def chgk_parse(text, defaultauthor=None, args=None):
     parser = ChgkParser(defaultauthor=defaultauthor, args=args)
-
+    parsed = parser.parse(text)
+    return parsed


 class UnknownEncodingException(Exception):
@@ -779,7 +953,7 @@ def ensure_line_breaks(tag):

 def chgk_parse_docx(docxfile, defaultauthor="", args=None, logger=None):
     logger = logger or DummyLogger()
-    args = args or
+    args = args or DefaultNamespace()
     for_ol = {}

     def get_number(tag):
@@ -807,6 +981,11 @@ def chgk_parse_docx(docxfile, defaultauthor="", args=None, logger=None):
     else:
         with open(docxfile, "rb") as docx_file:
             html = mammoth.convert_to_html(docx_file).value
+    if args.debug:
+        with open(
+            os.path.join(target_dir, "debugdebug.pydocx"), "w", encoding="utf-8"
+        ) as dbg:
+            dbg.write(html)
     input_docx = (
         html.replace("</strong><strong>", "")
         .replace("</em><em>", "")
@@ -815,8 +994,8 @@ def chgk_parse_docx(docxfile, defaultauthor="", args=None, logger=None):
     bsoup = BeautifulSoup(input_docx, "html.parser")

     if args.debug:
-        with
-            os.path.join(target_dir, "debug.pydocx"), "w", "
+        with open(
+            os.path.join(target_dir, "debug.pydocx"), "w", encoding="utf-8"
         ) as dbg:
             dbg.write(input_docx)

@@ -886,7 +1065,7 @@ def chgk_parse_docx(docxfile, defaultauthor="", args=None, logger=None):
         ensure_line_breaks(tag)
     for tag in bsoup.find_all("table"):
         try:
-            table =
+            table = html2md(str(tag))
             tag.insert_before(table)
         except (TypeError, ValueError):
             logger.error(f"couldn't parse html table: {str(tag)}")
@@ -917,12 +1096,12 @@ def chgk_parse_docx(docxfile, defaultauthor="", args=None, logger=None):
         tag.unwrap()

     if args.debug:
-        with
-            os.path.join(target_dir, "debug_raw.html"), "w", "
+        with open(
+            os.path.join(target_dir, "debug_raw.html"), "w", encoding="utf-8"
         ) as dbg:
             dbg.write(str(bsoup))
-        with
-            os.path.join(target_dir, "debug.html"), "w", "
+        with open(
+            os.path.join(target_dir, "debug.html"), "w", encoding="utf-8"
         ) as dbg:
             dbg.write(bsoup.prettify())

@@ -960,7 +1139,9 @@ def chgk_parse_docx(docxfile, defaultauthor="", args=None, logger=None):
         txt = txt.replace(f"IMGPATH({i})", elem)

     if args.debug:
-        with
+        with open(
+            os.path.join(target_dir, "debug.debug"), "w", encoding="utf-8"
+        ) as dbg:
             dbg.write(txt)

     final_structure = chgk_parse(txt, defaultauthor=defaultauthor, args=args)
@@ -994,7 +1175,7 @@ def chgk_parse_wrapper(path, args, logger=None):
         sys.exit()
     outfilename = os.path.join(target_dir, make_filename(abspath, "4s", args))
     logger.info("Output: {}".format(os.path.abspath(outfilename)))
-    with
+    with open(outfilename, "w", encoding="utf-8") as output_file:
         output_file.write(compose_4s(final_structure, args=args))
     return outfilename

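The parser.py changes above add an optional image-download step: when `args.download_images` is set, direct image URLs found in question fields (everything except `source`) are fetched once, stored under the chgksuite directory, and rewritten to `(img <local path>)` references. The cache key is the first 20 hex characters of a SHA-256 hash of the URL, kept in `image_download_cache.json`. Below is a minimal standalone sketch of that caching scheme, assuming only the behavior visible in the diff; the demo paths and the `cached_download` helper name are made up for illustration and are not chgksuite's API.

```python
import hashlib
import json
import os

import requests  # third-party HTTP client, also used by the real parser

# Hypothetical demo locations; the real code derives these from get_chgksuite_dir()
CACHE_DIR = os.path.expanduser("~/.chgksuite-demo/downloaded_images")
CACHE_INDEX = os.path.expanduser("~/.chgksuite-demo/image_download_cache.json")


def cached_download(url):
    """Download url once; later calls with the same url reuse the stored file."""
    os.makedirs(CACHE_DIR, exist_ok=True)
    index = {}
    if os.path.isfile(CACHE_INDEX):
        with open(CACHE_INDEX, encoding="utf8") as f:
            index = json.load(f)
    # Same keying as the diff: first 20 hex chars of SHA-256 of the URL
    key = hashlib.sha256(url.encode("utf-8")).hexdigest()[:20]
    cached = index.get(key)
    if cached and os.path.isfile(os.path.join(CACHE_DIR, cached)):
        return os.path.join(CACHE_DIR, cached)
    filename = key + ".jpg"  # the real code guesses the extension from the URL path
    path = os.path.join(CACHE_DIR, filename)
    resp = requests.get(url, timeout=30, stream=True)
    resp.raise_for_status()
    with open(path, "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    index[key] = filename
    with open(CACHE_INDEX, "w", encoding="utf8") as f:
        json.dump(index, f, indent=2, sort_keys=True)
    return path
```

The real implementation additionally sends browser-like request headers, sleeps 0.5 s between downloads as rate limiting, and logs a warning instead of raising when a download fails.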
chgksuite/parser_db.py
CHANGED
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

-import codecs
 import json
 import os
 import re
@@ -128,7 +127,7 @@ def t_ANSWER(t):
     t.lexer.text = ""
     if t.lexer.question["answer"]:
         logger.warning(
-            "Bad format: several Answer fields.
+            "Bad format: several Answer fields. Previous Answer was: '%s'",
             t.lexer.question["answer"],
         )

@@ -151,7 +150,7 @@ def t_COMMENT(t):
     t.lexer.text = ""
     if t.lexer.question["comment"]:
         logger.warning(
-            "Bad format: several Comment fields.
+            "Bad format: several Comment fields. Previous Comment was: '%s'",
             t.lexer.question["comment"],
         )

@@ -162,7 +161,7 @@ def t_SOURCE(t):
     t.lexer.text = ""
     if t.lexer.question["source"]:
         logger.warning(
-            "Bad format: several Source fields.
+            "Bad format: several Source fields. Previous Source was: '%s'",
             t.lexer.question["source"],
         )

@@ -417,7 +416,6 @@ def replace_handouts(match_handout):


 def chgk_parse_db(text, debug=False, logger=False):
-
     if not logger:
         logger = init_logger("parser_db", debug=debug)

@@ -437,7 +435,7 @@ def chgk_parse_db(text, debug=False, logger=False):
         append_question(lexer)

     if debug:
-        with
+        with open("debug_final.json", "w", encoding="utf-8") as f:
             f.write(json.dumps(lexer.structure, ensure_ascii=False, indent=4))

     return lexer.structure
chgksuite/resources/labels_az.toml
ADDED
@@ -0,0 +1,22 @@
+[question_labels]
+question = "Sual"
+answer = "Cavab"
+zachet = "Sayılma meyarı"
+nezachet = "Sayılmır"
+comment = "Şərh"
+source = "Mənbə"
+sources = "Mənbələr"
+author = "Müəllif"
+authors = "Müəlliflər"
+handout = "Paylama materialı"
+
+[general]
+section = "Tur"
+editor = "Redaktor"
+date = "Tarix"
+questions_in_comments = "Suallar şərh bölməsindədir."
+general_impressions_caption = "Ümumi təəssüratlar"
+handout_for_question = "Paylama materialı sual {} üçün"
+general_impressions_text = "Paket barədə ümumi təəssüratlar — bu posun altına şərh kimi yazın."
+right_answers_for_stats = "Doğru cavablar"
+cf_image = "Şəklə baxın"
chgksuite/resources/labels_by.toml
CHANGED
@@ -9,7 +9,6 @@ sources = "Крыніцы"
 author = "Аўтар"
 authors = "Аўтары"
 handout = "Раздаткавы матэрыял"
-handout_short = "Раздат"

 [general]
 section = "Тур"
@@ -19,4 +18,4 @@ questions_in_comments = "Пытанні ў каментарах."
 handout_for_question = "Раздаткавы матэрыял да пытання {}"
 general_impressions_caption = "Агульныя ўражанні"
 general_impressions_text = "Агульныя ўражанні ад пакета — у каментарах да гэтага паста."
-right_answers_for_stats = "Правільных адказаў"
+right_answers_for_stats = "Правільных адказаў"
chgksuite/resources/labels_by_tar.toml
CHANGED
@@ -9,7 +9,6 @@ sources = "Крыніцы"
 author = "Аўтар"
 authors = "Аўтары"
 handout = "Раздаткавы матэрыял"
-handout_short = "Раздат"

 [general]
 section = "Тур"
@@ -19,4 +18,4 @@ questions_in_comments = "Пытаньні ў каментарыях."
 handout_for_question = "Раздатачны матэрыял да пытаньня {}"
 general_impressions_caption = "Агульныя ўражаньні"
 general_impressions_text = "Агульныя ўражаньні ад пакета — у каментарыях да гэтага паста."
-right_answers_for_stats = "Правільных адказаў"
+right_answers_for_stats = "Правільных адказаў"
chgksuite/resources/labels_en.toml
CHANGED
@@ -9,7 +9,6 @@ sources = "Sources"
 author = "Author"
 authors = "Authors"
 handout = "Handout"
-handout_short = "Handout"

 [general]
 section = "Block"
@@ -19,4 +18,4 @@ questions_in_comments = "Questions are in the comments."
 handout_for_question = "Handout for question {}"
 general_impressions_caption = "General impression"
 general_impressions_text = "Please share your general impression of the packet in the comments to this post."
-right_answers_for_stats = "Correct answers"
+right_answers_for_stats = "Correct answers"
chgksuite/resources/labels_kz_cyr.toml
CHANGED
@@ -9,7 +9,6 @@ sources = "Дереккөздер"
 author = "Автор"
 authors = "Авторлар"
 handout = "Үлестіру материалы"
-handout_short = "Материал"

 [general]
 section = "Тур"
@@ -20,4 +19,4 @@ general_impressions_caption = "Жалпы әсер"
 handout_for_question = "{}-сұрақтың үлестіру материалы"
 general_impressions_text = "Пакеттен алған жалпы әсер — осы пост астындағы комментарийлерде."
 right_answers_for_stats = "Алған сұрақтар"
-cf_image = "суретті қараңыз"
+cf_image = "суретті қараңыз"
chgksuite/resources/labels_ru.toml
CHANGED
@@ -9,7 +9,6 @@ sources = "Источники"
 author = "Автор"
 authors = "Авторы"
 handout = "Раздаточный материал"
-handout_short = "Раздат"

 [general]
 section = "Тур"
@@ -20,4 +19,4 @@ general_impressions_caption = "Общие впечатления"
 handout_for_question = "Раздаточный материал к вопросу {}"
 general_impressions_text = "Общее впечатление от пакета — в комментариях к этому посту."
 right_answers_for_stats = "Взятия"
-cf_image = "см. изображение"
+cf_image = "см. изображение"
chgksuite/resources/labels_sr.toml
CHANGED
@@ -9,7 +9,6 @@ sources = "Izvori"
 author = "Autor"
 authors = "Autori"
 handout = "Materijal za deljenje"
-handout_short = "Podeljeno"

 [general]
 section = "Runda"
@@ -20,4 +19,4 @@ general_impressions_caption = "Opšti utisci"
 handout_for_question = "Materijal za deljenje uz pitanje {}"
 general_impressions_text = "Opšti utisak od paketa — u komentarima na ovu objavu."
 right_answers_for_stats = "Pogodak"
-cf_image = "vidi sliku"
+cf_image = "vidi sliku"
chgksuite/resources/labels_ua.toml
CHANGED
@@ -9,7 +9,6 @@ sources = "Джерела"
 author = "Автор"
 authors = "Автори"
 handout = "Роздатковий матеріал"
-handout_short = "Роздат"

 [general]
 section = "Тур"
@@ -19,4 +18,4 @@ questions_in_comments = "Запитання у коментарях."
 handout_for_question = "Роздатковий матеріал до запитання {}"
 general_impressions_caption = "Загальні враження"
 general_impressions_text = "Загальні враження про пакет — у коментарях до цього посту."
-right_answers_for_stats = "Взяття"
+right_answers_for_stats = "Взяття"
chgksuite/resources/labels_uz.toml
CHANGED
@@ -9,7 +9,6 @@ sources = "Manbalar"
 author = "Muallif"
 authors = "Mualliflar"
 handout = "Tarqatma material"
-handout_short = "Tarqat"

 [general]
 section = "Tur"
@@ -20,5 +19,3 @@ handout_for_question = "{} savolga tarqatma material."
 general_impressions_caption = "Umumiy taasurotlar"
 general_impressions_text = "To‘plamdan umumiy taasurotlar — ushbu postning izohlarida."
 right_answers_for_stats = "To‘g‘ri javoblar foizi"
-
-