chgksuite 0.26.0b10__py3-none-any.whl → 0.26.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chgksuite/cli.py +17 -0
- chgksuite/composer/chgksuite_parser.py +4 -2
- chgksuite/composer/composer_common.py +18 -0
- chgksuite/composer/docx.py +521 -292
- chgksuite/composer/latex.py +2 -2
- chgksuite/composer/pptx.py +17 -5
- chgksuite/composer/telegram.py +20 -9
- chgksuite/handouter/runner.py +3 -0
- chgksuite/handouter/utils.py +22 -1
- chgksuite/parser.py +208 -14
- chgksuite/resources/regexes_by.json +1 -1
- chgksuite/resources/regexes_en.json +1 -1
- chgksuite/resources/regexes_kz_cyr.json +1 -1
- chgksuite/resources/regexes_ru.json +1 -1
- chgksuite/resources/regexes_sr.json +1 -1
- chgksuite/resources/regexes_ua.json +1 -1
- chgksuite/resources/regexes_uz_cyr.json +1 -1
- chgksuite/version.py +1 -1
- {chgksuite-0.26.0b10.dist-info → chgksuite-0.26.1.dist-info}/METADATA +1 -1
- {chgksuite-0.26.0b10.dist-info → chgksuite-0.26.1.dist-info}/RECORD +24 -24
- {chgksuite-0.26.0b10.dist-info → chgksuite-0.26.1.dist-info}/WHEEL +0 -0
- {chgksuite-0.26.0b10.dist-info → chgksuite-0.26.1.dist-info}/entry_points.txt +0 -0
- {chgksuite-0.26.0b10.dist-info → chgksuite-0.26.1.dist-info}/licenses/LICENSE +0 -0
- {chgksuite-0.26.0b10.dist-info → chgksuite-0.26.1.dist-info}/top_level.txt +0 -0
chgksuite/composer/latex.py
CHANGED
|
@@ -199,11 +199,11 @@ class LatexExporter(BaseExporter):
|
|
|
199
199
|
firsttour = True
|
|
200
200
|
for element in self.structure:
|
|
201
201
|
if element[0] == "heading":
|
|
202
|
-
tex += "\n{{\\huge {}}}\n
|
|
202
|
+
tex += "\n{{\\huge {}}}\n\\vspace{{0.8em}}\n".format(
|
|
203
203
|
self.tex_element_layout(element[1])
|
|
204
204
|
)
|
|
205
205
|
if element[0] == "date":
|
|
206
|
-
tex += "\n{{\\large {}}}\n
|
|
206
|
+
tex += "\n{{\\large {}}}\n\\vspace{{0.8em}}\n".format(
|
|
207
207
|
self.tex_element_layout(element[1])
|
|
208
208
|
)
|
|
209
209
|
if element[0] in {"meta", "editor"}:
|
chgksuite/composer/pptx.py
CHANGED
|
@@ -5,7 +5,12 @@ import re
|
|
|
5
5
|
import toml
|
|
6
6
|
|
|
7
7
|
from chgksuite.common import log_wrap, replace_escaped, tryint
|
|
8
|
-
from chgksuite.composer.composer_common import
|
|
8
|
+
from chgksuite.composer.composer_common import (
|
|
9
|
+
BaseExporter,
|
|
10
|
+
backtick_replace,
|
|
11
|
+
parseimg,
|
|
12
|
+
remove_accents_standalone,
|
|
13
|
+
)
|
|
9
14
|
from pptx import Presentation
|
|
10
15
|
from pptx.dml.color import RGBColor
|
|
11
16
|
from pptx.enum.text import MSO_AUTO_SIZE, MSO_VERTICAL_ANCHOR, PP_ALIGN
|
|
@@ -109,15 +114,20 @@ class PptxExporter(BaseExporter):
|
|
|
109
114
|
r.font.underline = True
|
|
110
115
|
|
|
111
116
|
def pptx_process_text(
|
|
112
|
-
self,
|
|
117
|
+
self,
|
|
118
|
+
s,
|
|
119
|
+
image=None,
|
|
120
|
+
strip_brackets=True,
|
|
121
|
+
replace_spaces=True,
|
|
122
|
+
do_not_remove_accents=False,
|
|
113
123
|
):
|
|
114
124
|
hs = self.labels["question_labels"]["handout_short"]
|
|
115
125
|
if isinstance(s, list):
|
|
116
126
|
for i in range(len(s)):
|
|
117
127
|
s[i] = self.pptx_process_text(s[i], image=image)
|
|
118
128
|
return s
|
|
119
|
-
if not self.args.do_not_remove_accents:
|
|
120
|
-
s = s
|
|
129
|
+
if not (self.args.do_not_remove_accents or do_not_remove_accents):
|
|
130
|
+
s = remove_accents_standalone(s, self.labels)
|
|
121
131
|
if strip_brackets:
|
|
122
132
|
s = self.remove_square_brackets(s)
|
|
123
133
|
s = s.replace("]\n", "]\n\n")
|
|
@@ -408,7 +418,9 @@ class PptxExporter(BaseExporter):
|
|
|
408
418
|
if number is not None:
|
|
409
419
|
self.set_question_number(slide, number)
|
|
410
420
|
p = self.init_paragraph(tf, text=handout)
|
|
411
|
-
self.pptx_format(
|
|
421
|
+
self.pptx_format(
|
|
422
|
+
self.pptx_process_text(handout, do_not_remove_accents=True), p, tf, slide
|
|
423
|
+
)
|
|
412
424
|
|
|
413
425
|
def process_question_text(self, q):
|
|
414
426
|
image = self._get_image_from_4s(q["question"])
|
chgksuite/composer/telegram.py
CHANGED
|
@@ -126,7 +126,9 @@ class TelegramExporter(BaseExporter):
|
|
|
126
126
|
if result:
|
|
127
127
|
msg_data = json.loads(result["raw_data"])
|
|
128
128
|
if msg_data["message"]["chat"]["type"] != "private":
|
|
129
|
-
print(
|
|
129
|
+
print(
|
|
130
|
+
"You should post to the PRIVATE chat, not to the channel/group"
|
|
131
|
+
)
|
|
130
132
|
continue
|
|
131
133
|
self.control_chat_id = msg_data["message"]["chat"]["id"]
|
|
132
134
|
self.send_api_request(
|
|
@@ -869,7 +871,7 @@ class TelegramExporter(BaseExporter):
|
|
|
869
871
|
raise Exception("Failed to get channel ID from forwarded message")
|
|
870
872
|
else:
|
|
871
873
|
raise Exception("Channel ID is undefined")
|
|
872
|
-
|
|
874
|
+
|
|
873
875
|
# Handle chat resolution
|
|
874
876
|
if isinstance(chat_result, int):
|
|
875
877
|
chat_id = chat_result
|
|
@@ -881,7 +883,9 @@ class TelegramExporter(BaseExporter):
|
|
|
881
883
|
f"Please write a message in the discussion group with text: {self.chat_auth_uuid}"
|
|
882
884
|
)
|
|
883
885
|
print("This will allow me to extract the group ID automatically.")
|
|
884
|
-
print(
|
|
886
|
+
print(
|
|
887
|
+
"The bot MUST be added do the group and made admin, else it won't work!"
|
|
888
|
+
)
|
|
885
889
|
print("=" * 50 + "\n")
|
|
886
890
|
|
|
887
891
|
# Wait for a forwarded message with chat information
|
|
@@ -1158,7 +1162,10 @@ class TelegramExporter(BaseExporter):
|
|
|
1158
1162
|
if get_text(msg_data) != self.chat_auth_uuid:
|
|
1159
1163
|
continue
|
|
1160
1164
|
extracted_id = msg_data["message"]["chat"]["id"]
|
|
1161
|
-
if
|
|
1165
|
+
if (
|
|
1166
|
+
extracted_id == channel_numeric_id
|
|
1167
|
+
or extracted_id == self.control_chat_id
|
|
1168
|
+
):
|
|
1162
1169
|
self.logger.warning(
|
|
1163
1170
|
"User posted a message in the channel, not the discussion group"
|
|
1164
1171
|
)
|
|
@@ -1168,7 +1175,7 @@ class TelegramExporter(BaseExporter):
|
|
|
1168
1175
|
"chat_id": self.control_chat_id,
|
|
1169
1176
|
"text": (
|
|
1170
1177
|
"⚠️ You posted a message in the channel, not in the discussion group."
|
|
1171
|
-
)
|
|
1178
|
+
),
|
|
1172
1179
|
},
|
|
1173
1180
|
)
|
|
1174
1181
|
# Skip this message and continue waiting
|
|
@@ -1176,7 +1183,10 @@ class TelegramExporter(BaseExporter):
|
|
|
1176
1183
|
elif entity_type == "channel":
|
|
1177
1184
|
if msg_data["message"]["chat"]["id"] != self.control_chat_id:
|
|
1178
1185
|
continue
|
|
1179
|
-
if
|
|
1186
|
+
if (
|
|
1187
|
+
"message" in msg_data
|
|
1188
|
+
and "forward_from_chat" in msg_data["message"]
|
|
1189
|
+
):
|
|
1180
1190
|
forward_info = msg_data["message"]["forward_from_chat"]
|
|
1181
1191
|
|
|
1182
1192
|
# Extract chat ID from the message
|
|
@@ -1187,9 +1197,10 @@ class TelegramExporter(BaseExporter):
|
|
|
1187
1197
|
else:
|
|
1188
1198
|
extracted_id = chat_id
|
|
1189
1199
|
# For channels, check the type; for chats, accept any type except "channel" if check_type is False
|
|
1190
|
-
if extracted_id and (
|
|
1191
|
-
|
|
1192
|
-
|
|
1200
|
+
if extracted_id and (
|
|
1201
|
+
(check_type and forward_info.get("type") == "channel")
|
|
1202
|
+
or (not check_type)
|
|
1203
|
+
):
|
|
1193
1204
|
resolved = True
|
|
1194
1205
|
self.created_at = row["created_at"]
|
|
1195
1206
|
self.logger.info(
|
chgksuite/handouter/runner.py
CHANGED
|
@@ -124,6 +124,9 @@ class HandoutGenerator:
|
|
|
124
124
|
|
|
125
125
|
def generate(self):
|
|
126
126
|
for block in self.parse_input(self.args.filename):
|
|
127
|
+
if not block:
|
|
128
|
+
self.blocks.append("\n\\clearpage\n")
|
|
129
|
+
continue
|
|
127
130
|
if self.args.debug:
|
|
128
131
|
print(block)
|
|
129
132
|
if block.get("for_question"):
|
chgksuite/handouter/utils.py
CHANGED
|
@@ -45,8 +45,29 @@ def wrap_val(key, val):
|
|
|
45
45
|
return val.strip()
|
|
46
46
|
|
|
47
47
|
|
|
48
|
+
def split_array_by_value(arr, delimiter):
|
|
49
|
+
result = []
|
|
50
|
+
current_subarray = []
|
|
51
|
+
for item in arr:
|
|
52
|
+
if item == delimiter:
|
|
53
|
+
result.append(current_subarray)
|
|
54
|
+
current_subarray = []
|
|
55
|
+
else:
|
|
56
|
+
current_subarray.append(item)
|
|
57
|
+
result.append(current_subarray)
|
|
58
|
+
return result
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def split_blocks(contents):
|
|
62
|
+
lines = contents.split("\n")
|
|
63
|
+
sp = ["\n".join(x) for x in split_array_by_value(lines, "---")]
|
|
64
|
+
if not sp[0].strip():
|
|
65
|
+
sp = sp[1:]
|
|
66
|
+
return sp
|
|
67
|
+
|
|
68
|
+
|
|
48
69
|
def parse_handouts(contents):
|
|
49
|
-
blocks = contents
|
|
70
|
+
blocks = split_blocks(contents)
|
|
50
71
|
result = []
|
|
51
72
|
for block_ in blocks:
|
|
52
73
|
block = block_.strip()
|
chgksuite/parser.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import base64
|
|
4
4
|
import codecs
|
|
5
5
|
import datetime
|
|
6
|
+
import hashlib
|
|
6
7
|
import itertools
|
|
7
8
|
import json
|
|
8
9
|
import os
|
|
@@ -13,12 +14,14 @@ import subprocess
|
|
|
13
14
|
import sys
|
|
14
15
|
import tempfile
|
|
15
16
|
import urllib
|
|
17
|
+
import time
|
|
16
18
|
|
|
17
19
|
import bs4
|
|
18
20
|
import chardet
|
|
19
21
|
import dashtable
|
|
20
22
|
import mammoth
|
|
21
23
|
import pypandoc
|
|
24
|
+
import requests
|
|
22
25
|
import toml
|
|
23
26
|
from bs4 import BeautifulSoup
|
|
24
27
|
from parse import parse
|
|
@@ -26,11 +29,11 @@ from parse import parse
|
|
|
26
29
|
import chgksuite.typotools as typotools
|
|
27
30
|
from chgksuite.common import (
|
|
28
31
|
QUESTION_LABELS,
|
|
29
|
-
DefaultArgs,
|
|
30
32
|
DefaultNamespace,
|
|
31
33
|
DummyLogger,
|
|
32
34
|
check_question,
|
|
33
35
|
compose_4s,
|
|
36
|
+
get_chgksuite_dir,
|
|
34
37
|
get_lastdir,
|
|
35
38
|
init_logger,
|
|
36
39
|
load_settings,
|
|
@@ -40,6 +43,7 @@ from chgksuite.common import (
|
|
|
40
43
|
from chgksuite.composer import gui_compose
|
|
41
44
|
from chgksuite.composer.composer_common import make_filename
|
|
42
45
|
from chgksuite.parser_db import chgk_parse_db
|
|
46
|
+
from chgksuite.typotools import re_url
|
|
43
47
|
from chgksuite.typotools import remove_excessive_whitespace as rew
|
|
44
48
|
|
|
45
49
|
ENC = sys.stdout.encoding or "utf8"
|
|
@@ -107,7 +111,7 @@ class ChgkParser:
|
|
|
107
111
|
|
|
108
112
|
def __init__(self, defaultauthor=None, args=None, logger=None):
|
|
109
113
|
self.defaultauthor = defaultauthor
|
|
110
|
-
args = args or
|
|
114
|
+
args = args or DefaultNamespace()
|
|
111
115
|
self.regexes = load_regexes(args.regexes)
|
|
112
116
|
self.logger = logger or init_logger("parser")
|
|
113
117
|
self.args = args
|
|
@@ -121,6 +125,148 @@ class ChgkParser:
|
|
|
121
125
|
if self.args.language == "en":
|
|
122
126
|
self.args.typography_quotes = "off"
|
|
123
127
|
|
|
128
|
+
def _setup_image_cache(self):
|
|
129
|
+
"""Setup image download cache directory and load existing cache"""
|
|
130
|
+
if not hasattr(self, "_image_cache"):
|
|
131
|
+
self.image_cache_dir = os.path.join(
|
|
132
|
+
get_chgksuite_dir(), "downloaded_images"
|
|
133
|
+
)
|
|
134
|
+
os.makedirs(self.image_cache_dir, exist_ok=True)
|
|
135
|
+
|
|
136
|
+
self.image_cache_file = os.path.join(
|
|
137
|
+
get_chgksuite_dir(), "image_download_cache.json"
|
|
138
|
+
)
|
|
139
|
+
if os.path.isfile(self.image_cache_file):
|
|
140
|
+
try:
|
|
141
|
+
with open(self.image_cache_file, encoding="utf8") as f:
|
|
142
|
+
self._image_cache = json.load(f)
|
|
143
|
+
except (json.JSONDecodeError, OSError):
|
|
144
|
+
self._image_cache = {}
|
|
145
|
+
else:
|
|
146
|
+
self._image_cache = {}
|
|
147
|
+
|
|
148
|
+
def _download_image(self, url):
|
|
149
|
+
"""Download image from URL and return local filename"""
|
|
150
|
+
self._setup_image_cache()
|
|
151
|
+
url = url.replace("\\", "")
|
|
152
|
+
|
|
153
|
+
# Check cache first
|
|
154
|
+
url_hash = hashlib.sha256(url.encode("utf-8")).hexdigest()[:20]
|
|
155
|
+
if url_hash in self._image_cache:
|
|
156
|
+
cached_filename = self._image_cache[url_hash]
|
|
157
|
+
cached_path = os.path.join(self.image_cache_dir, cached_filename)
|
|
158
|
+
if os.path.isfile(cached_path):
|
|
159
|
+
return cached_path
|
|
160
|
+
|
|
161
|
+
# Determine file extension
|
|
162
|
+
parsed_url = urllib.parse.urlparse(url)
|
|
163
|
+
path_lower = parsed_url.path.lower()
|
|
164
|
+
ext = None
|
|
165
|
+
for image_ext in [".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".svg"]:
|
|
166
|
+
if path_lower.endswith(image_ext):
|
|
167
|
+
ext = image_ext
|
|
168
|
+
break
|
|
169
|
+
|
|
170
|
+
if not ext:
|
|
171
|
+
# Try to guess from URL structure
|
|
172
|
+
if any(
|
|
173
|
+
img_ext in path_lower
|
|
174
|
+
for img_ext in [
|
|
175
|
+
".jpg",
|
|
176
|
+
".jpeg",
|
|
177
|
+
".png",
|
|
178
|
+
".webp",
|
|
179
|
+
".gif",
|
|
180
|
+
".bmp",
|
|
181
|
+
".svg",
|
|
182
|
+
]
|
|
183
|
+
):
|
|
184
|
+
for image_ext in [
|
|
185
|
+
".jpg",
|
|
186
|
+
".jpeg",
|
|
187
|
+
".png",
|
|
188
|
+
".webp",
|
|
189
|
+
".gif",
|
|
190
|
+
".bmp",
|
|
191
|
+
".svg",
|
|
192
|
+
]:
|
|
193
|
+
if image_ext in path_lower:
|
|
194
|
+
ext = image_ext
|
|
195
|
+
break
|
|
196
|
+
else:
|
|
197
|
+
ext = ".jpg" # Default extension
|
|
198
|
+
|
|
199
|
+
filename = url_hash + ext
|
|
200
|
+
filepath = os.path.join(self.image_cache_dir, filename)
|
|
201
|
+
|
|
202
|
+
try:
|
|
203
|
+
self.logger.info(f"Downloading image from {url}")
|
|
204
|
+
headers = {
|
|
205
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
|
206
|
+
"Accept": "image/png,image/jpeg,image/webp,image/gif,image/*,*/*;q=0.8",
|
|
207
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
208
|
+
"Accept-Encoding": "gzip, deflate, br",
|
|
209
|
+
"Connection": "keep-alive",
|
|
210
|
+
"Upgrade-Insecure-Requests": "1",
|
|
211
|
+
}
|
|
212
|
+
response = requests.get(url, timeout=30, stream=True, headers=headers)
|
|
213
|
+
response.raise_for_status()
|
|
214
|
+
time.sleep(0.5) # rate limiting
|
|
215
|
+
|
|
216
|
+
with open(filepath, "wb") as f:
|
|
217
|
+
for chunk in response.iter_content(chunk_size=8192):
|
|
218
|
+
f.write(chunk)
|
|
219
|
+
|
|
220
|
+
# Update cache
|
|
221
|
+
self._image_cache[url_hash] = filename
|
|
222
|
+
with open(self.image_cache_file, "w", encoding="utf8") as f:
|
|
223
|
+
json.dump(self._image_cache, f, indent=2, sort_keys=True)
|
|
224
|
+
|
|
225
|
+
return filepath
|
|
226
|
+
|
|
227
|
+
except Exception as e:
|
|
228
|
+
self.logger.warning(f"Failed to download image from {url}: {e}")
|
|
229
|
+
return None
|
|
230
|
+
|
|
231
|
+
def _process_images_in_text(self, text):
|
|
232
|
+
"""Process text to find image URLs and replace them with local references"""
|
|
233
|
+
if not text or not getattr(self.args, "download_images", False):
|
|
234
|
+
return text
|
|
235
|
+
|
|
236
|
+
if isinstance(text, list):
|
|
237
|
+
return [self._process_images_in_text(item) for item in text]
|
|
238
|
+
|
|
239
|
+
if not isinstance(text, str):
|
|
240
|
+
return text
|
|
241
|
+
|
|
242
|
+
# Find all URLs in the text
|
|
243
|
+
for match in re_url.finditer(text):
|
|
244
|
+
url = match.group(0)
|
|
245
|
+
url_lower = url.lower()
|
|
246
|
+
|
|
247
|
+
# Check if it's a direct image URL
|
|
248
|
+
if any(
|
|
249
|
+
url_lower.endswith(ext)
|
|
250
|
+
for ext in [".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".svg"]
|
|
251
|
+
):
|
|
252
|
+
local_filename = self._download_image(url)
|
|
253
|
+
if local_filename:
|
|
254
|
+
# Replace URL with chgksuite image syntax
|
|
255
|
+
img_reference = f"(img {local_filename})"
|
|
256
|
+
text = text.replace(url, img_reference)
|
|
257
|
+
|
|
258
|
+
return text
|
|
259
|
+
|
|
260
|
+
def _process_question_images(self, question):
|
|
261
|
+
"""Process a question dict to download images from URLs"""
|
|
262
|
+
if not getattr(self.args, "download_images", False):
|
|
263
|
+
return
|
|
264
|
+
|
|
265
|
+
# Process all fields except 'source'
|
|
266
|
+
for field in question:
|
|
267
|
+
if field != "source":
|
|
268
|
+
question[field] = self._process_images_in_text(question[field])
|
|
269
|
+
|
|
124
270
|
def merge_to_previous(self, index):
|
|
125
271
|
target = index - 1
|
|
126
272
|
if self.structure[target][1]:
|
|
@@ -513,7 +659,19 @@ class ChgkParser:
|
|
|
513
659
|
):
|
|
514
660
|
self.merge_to_next(0)
|
|
515
661
|
|
|
516
|
-
|
|
662
|
+
if debug:
|
|
663
|
+
with codecs.open("debug_3a.json", "w", "utf8") as f:
|
|
664
|
+
f.write(
|
|
665
|
+
json.dumps(
|
|
666
|
+
list(enumerate(self.structure)), ensure_ascii=False, indent=4
|
|
667
|
+
)
|
|
668
|
+
)
|
|
669
|
+
|
|
670
|
+
idx = 0
|
|
671
|
+
cycle = -1
|
|
672
|
+
while idx < len(self.structure):
|
|
673
|
+
cycle += 1
|
|
674
|
+
element = self.structure[idx]
|
|
517
675
|
if element[0] == "":
|
|
518
676
|
element[0] = "meta"
|
|
519
677
|
if element[0] in regexes and element[0] not in [
|
|
@@ -524,19 +682,43 @@ class ChgkParser:
|
|
|
524
682
|
if element[0] == "question":
|
|
525
683
|
try:
|
|
526
684
|
num = regexes["question"].search(element[1])
|
|
527
|
-
if num:
|
|
528
|
-
self.structure.insert(
|
|
685
|
+
if num and num.group("number"):
|
|
686
|
+
self.structure.insert(idx, ["number", num.group("number")])
|
|
687
|
+
idx += 1
|
|
529
688
|
except Exception as e:
|
|
689
|
+
num = None
|
|
530
690
|
sys.stderr.write(
|
|
531
|
-
f"exception at
|
|
691
|
+
f"exception at setting number: {type(e)} {e}\n"
|
|
532
692
|
)
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
693
|
+
if (
|
|
694
|
+
num is None
|
|
695
|
+
or num and not num.group("number")
|
|
696
|
+
) and (
|
|
697
|
+
("нулевой вопрос" in element[1].lower())
|
|
698
|
+
or ("разминочный вопрос" in element[1].lower())
|
|
699
|
+
):
|
|
700
|
+
self.structure.insert(idx, ["number", "0"])
|
|
701
|
+
idx += 1
|
|
702
|
+
if element[0] == "question":
|
|
703
|
+
lines = element[1].split(SEP)
|
|
704
|
+
for i, line in enumerate(lines):
|
|
705
|
+
if regexes["question"].search(line):
|
|
706
|
+
lines[i] = regexes["question"].sub("", line, 1)
|
|
707
|
+
element[1] = SEP.join([x.strip() for x in lines if x.strip()])
|
|
708
|
+
before_replacement = None
|
|
709
|
+
else:
|
|
710
|
+
before_replacement = element[1]
|
|
711
|
+
element[1] = regexes[element[0]].sub("", element[1], 1)
|
|
536
712
|
if element[1].startswith(SEP):
|
|
537
713
|
element[1] = element[1][len(SEP) :]
|
|
538
|
-
|
|
714
|
+
# TODO: переделать корявую обработку авторки на нормальную
|
|
715
|
+
if (
|
|
716
|
+
element[0] == "author"
|
|
717
|
+
and before_replacement
|
|
718
|
+
and "авторка:" in before_replacement.lower()
|
|
719
|
+
):
|
|
539
720
|
element[1] = "!!Авторка" + element[1]
|
|
721
|
+
idx += 1
|
|
540
722
|
|
|
541
723
|
if debug:
|
|
542
724
|
with codecs.open("debug_4.json", "w", "utf8") as f:
|
|
@@ -551,7 +733,7 @@ class ChgkParser:
|
|
|
551
733
|
try:
|
|
552
734
|
num = regexes["question"].search(element[1])
|
|
553
735
|
if num:
|
|
554
|
-
self.structure.insert(_id, ["number", num.group(
|
|
736
|
+
self.structure.insert(_id, ["number", num.group("number")])
|
|
555
737
|
except Exception as e:
|
|
556
738
|
sys.stderr.write(
|
|
557
739
|
f"exception at line 470 of parser: {type(e)} {e}\n"
|
|
@@ -559,7 +741,6 @@ class ChgkParser:
|
|
|
559
741
|
element[1] = regexes["question"].sub("", element[1])
|
|
560
742
|
|
|
561
743
|
# detect inner lists
|
|
562
|
-
|
|
563
744
|
mo = {
|
|
564
745
|
m for m in re.finditer(r"(\s+|^)(\d+)[\.\)]\s*(?!\d)", element[1], re.U)
|
|
565
746
|
}
|
|
@@ -636,6 +817,7 @@ class ChgkParser:
|
|
|
636
817
|
):
|
|
637
818
|
if self.defaultauthor and "author" not in current_question:
|
|
638
819
|
current_question["author"] = self.defaultauthor
|
|
820
|
+
self._process_question_images(current_question)
|
|
639
821
|
check_question(current_question, logger=logger)
|
|
640
822
|
final_structure.append(["Question", current_question])
|
|
641
823
|
current_question = {}
|
|
@@ -669,6 +851,7 @@ class ChgkParser:
|
|
|
669
851
|
if current_question != {}:
|
|
670
852
|
if self.defaultauthor and "author" not in current_question:
|
|
671
853
|
current_question["author"] = self.defaultauthor
|
|
854
|
+
self._process_question_images(current_question)
|
|
672
855
|
check_question(current_question, logger=logger)
|
|
673
856
|
final_structure.append(["Question", current_question])
|
|
674
857
|
|
|
@@ -712,6 +895,11 @@ class ChgkParser:
|
|
|
712
895
|
elif element[0] == "tour" and self.args.tour_numbers_as_words == "on":
|
|
713
896
|
element[1] = f"{self.TOUR_NUMBERS_AS_WORDS[tour_cnt]} тур"
|
|
714
897
|
tour_cnt += 1
|
|
898
|
+
elif element[0] not in ["Question", "source"] and getattr(
|
|
899
|
+
self.args, "download_images", False
|
|
900
|
+
):
|
|
901
|
+
# Process images in metadata fields (excluding source)
|
|
902
|
+
element[1] = self._process_images_in_text(element[1])
|
|
715
903
|
|
|
716
904
|
if debug:
|
|
717
905
|
with codecs.open("debug_final.json", "w", "utf8") as f:
|
|
@@ -721,7 +909,8 @@ class ChgkParser:
|
|
|
721
909
|
|
|
722
910
|
def chgk_parse(text, defaultauthor=None, args=None):
|
|
723
911
|
parser = ChgkParser(defaultauthor=defaultauthor, args=args)
|
|
724
|
-
|
|
912
|
+
parsed = parser.parse(text)
|
|
913
|
+
return parsed
|
|
725
914
|
|
|
726
915
|
|
|
727
916
|
class UnknownEncodingException(Exception):
|
|
@@ -767,7 +956,7 @@ def ensure_line_breaks(tag):
|
|
|
767
956
|
|
|
768
957
|
def chgk_parse_docx(docxfile, defaultauthor="", args=None, logger=None):
|
|
769
958
|
logger = logger or DummyLogger()
|
|
770
|
-
args = args or
|
|
959
|
+
args = args or DefaultNamespace()
|
|
771
960
|
for_ol = {}
|
|
772
961
|
|
|
773
962
|
def get_number(tag):
|
|
@@ -795,6 +984,11 @@ def chgk_parse_docx(docxfile, defaultauthor="", args=None, logger=None):
|
|
|
795
984
|
else:
|
|
796
985
|
with open(docxfile, "rb") as docx_file:
|
|
797
986
|
html = mammoth.convert_to_html(docx_file).value
|
|
987
|
+
if args.debug:
|
|
988
|
+
with codecs.open(
|
|
989
|
+
os.path.join(target_dir, "debugdebug.pydocx"), "w", "utf8"
|
|
990
|
+
) as dbg:
|
|
991
|
+
dbg.write(html)
|
|
798
992
|
input_docx = (
|
|
799
993
|
html.replace("</strong><strong>", "")
|
|
800
994
|
.replace("</em><em>", "")
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"tour": "^(#\\s+)?Т[Уу][Рр]\\s?№?\\s?([0-9IVXLCDM]*)([\\.:])?$",
|
|
3
3
|
"tourrev": "^([0-9IVXLCDM]+)\\s[Тт][Уу][Рр]([\\.:])?$",
|
|
4
|
-
"question": "П[Ыы][Тт][Аа][Нн][Ьь]?[Нн][Ее]\\s?[№N]?([0-9\\s]*)\\s?([\\.:]|\n|\r\n|$)",
|
|
4
|
+
"question": "П[Ыы][Тт][Аа][Нн][Ьь]?[Нн][Ее]\\s?[№N]?(?P<number>[0-9\\s]*)\\s?([\\.:]|\n|\r\n|$)",
|
|
5
5
|
"handout": "Р[Аа][Зз][Дд][Аа][Тт]([Аа][Чч][Нн]|[Кк][Аа][Вв])[Ыы][\\s\\s][Мм][Аа][Тт][Ээ][Рр][Ыы][Яя][Лл][\\.:]",
|
|
6
6
|
"answer": "А[Дд][Кк][Аа][Зз][Ыы]?\\s?[№N]?([0-9]+)?\\s?[\\.:]",
|
|
7
7
|
"zachet": "З[Аа][Лл][Іі][Кк]\\s?[\\.:]",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"tour": "^(#\\s+)?(B[Ll][Oo][Cc][Kk]|R[Oo][Uu][Nn][Dd])\\s?№?\\s?([0-9IVXLCDM]*)([\\.:])?$",
|
|
3
3
|
"tourrev": "^([0-9IVXLCDM]+)\\s(B[Ll][Oo][Cc][Kk]|R[Oo][Uu][Nn][Dd])([\\.:])?$",
|
|
4
|
-
"question": "Q[Uu][Ee][Ss][Tt][Ii][Oo][Nn]\\s?[№N]?([0-9\\s]*)\\s?([\\.:]|\n|\r\n|$)",
|
|
4
|
+
"question": "Q[Uu][Ee][Ss][Tt][Ii][Oo][Nn]\\s?[№N]?(?P<number>[0-9\\s]*)\\s?([\\.:]|\n|\r\n|$)",
|
|
5
5
|
"answer": "A[Nn][Ss][Ww][Ee][Rr]\\s?[№N]?([0-9]+)?\\s?[\\.:]",
|
|
6
6
|
"handout": "^H[Aa][Nn][Dd][Oo][Uu][Tt][\\.:]",
|
|
7
7
|
"zachet": "A[Cc][Ee][Pp][Tt]\\s?[\\.:]",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"tour": "^(#\\s+)?([0-9]+)\\s?[-–—]\\s?[Тт][Уу][Рр]\\s*$",
|
|
3
|
-
"question": "([0-9]+)\\s?[-–—]\\s?\\sС[Ұұ][Рр][Аа][Ққ]([\\.:]|\n|\r\n|$)",
|
|
3
|
+
"question": "(?P<number>[0-9]+)\\s?[-–—]\\s?\\sС[Ұұ][Рр][Аа][Ққ]([\\.:]|\n|\r\n|$)",
|
|
4
4
|
"handout": "^Ү[Лл][Ее][Сс][Тт][Іі][Рр][Уу][\\s\\s][Мм][Аа][Тт][Ее][Рр][Ии][Аа][Лл][Ыы][\\.:]",
|
|
5
5
|
"answer": "Ж[Аа][Уу][Аа][Пп]\\s?[№N]?([0-9]+)?\\s?[\\.:]",
|
|
6
6
|
"zachet": "Қ[Аа][Бб][Ыы][Лл][Дд][Аа][Нн][Аа][Тт][Ыы][Нн]\\s[Жж][Аа][Уу][Аа][Пп](\\([Тт][Аа][Рр]\\))?\\s?[\\.:]",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"tour": "^(#\\s+)?Т[Уу][Рр]\\s?№?\\s?([0-9IVXLCDM]*)([\\.:])?$",
|
|
3
3
|
"tourrev": "^([0-9IVXLCDM]+|[Пп][Ее][Рр][Вв][Ыы][Йй]|[Вв][Тт][Оо][Рр][Оо][Йй]|[Тт][Рр][Ее][Тт][Ии][Йй]|[Чч][Ее][Тт][Вв][Ее][Рр][Тт][Ыы][Йй]|[Пп][Яя][Тт][Ыы][Йй]|[Шш][Ее][Сс][Тт][Оо][Йй]|[Сс][Ее][Дд][Ьь][Мм][Оо][Йй]|[Вв][Оо][Сс][Ьь][Мм][Оо][Йй]|[Дд][Ее][Вв][Яя][Тт][Ыы][Йй]|[Дд][Ее][Сс][Яя][Тт][Ыы][Йй])\\s[Тт][Уу][Рр]([\\.:])?$",
|
|
4
|
-
"question": "
|
|
4
|
+
"question": "^([Нн][Уу][Лл][Ее][Вв][Оо][Йй]|[Рр][Аа][Зз][Мм][Ии][Нн][Оо][Чч][Нн][Ыы][Йй])? ?[Вв][Оо][Пп][Рр][Оо][Сс]\\s?[№N]?(?P<number>[0-9\\s]*)\\s?([\\.:]|\n|\r\n|$)",
|
|
5
5
|
"handout": "^Р[Аа][Зз][Дд][Аа][Тт][Оо][Чч][Нн][Ыы][Йй]\\s+[Мм][Аа][Тт][Ее][Рр][Ии][Аа][Лл][\\.:]",
|
|
6
6
|
"answer": "О[Тт][Вв][Ее][Тт][Ыы]?\\s?[№N]?([0-9]+)?\\s?[\\.:]",
|
|
7
7
|
"zachet": "З[Аа][Чч][ЕеЁё][Тт]\\s?[\\.:]",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"tour": "^(#\\s+)?R[Uu][Nn][Dd][Aa]\\s?№?\\s?([0-9IVXLCDM]*)([\\.:])?$",
|
|
3
3
|
"tourrev": "^([0-9IVXLCDM]+)\\sR[Uu][Nn][Dd][Aa]([\\.:])?$",
|
|
4
|
-
"question": "P[Ii][Tt][Aa][Nn][Jj][Ee]\\s?[№N]?([0-9\\s]*)\\s?([\\.:]|\n|\r\n|$)",
|
|
4
|
+
"question": "P[Ii][Tt][Aa][Nn][Jj][Ee]\\s?[№N]?(?P<number>[0-9\\s]*)\\s?([\\.:]|\n|\r\n|$)",
|
|
5
5
|
"answer": "O[Dd][Gg][Oo][Vv][Oo][Rr]\\s?[№N]?([0-9]+)?\\s?[\\.:]",
|
|
6
6
|
"handout": "^(M[Aa][Tt][Ee][Rr][Ii][Jj][Aa][Ll]|P[Oo][Dd][Ee][Ll][Jj][Ee][Nn][Oo])[\\.:]",
|
|
7
7
|
"zachet": "P[Rr][Ii][Hh][Vv][Aa][Tt][Aa]\\s[Ss][Ee]\\s?[\\.:]",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"tour": "^(#\\s+)?Т[Уу][Рр]\\s?№?\\s?([0-9IVXLCDM]*)([\\.:])?$",
|
|
3
3
|
"tourrev": "^([0-9IVXLCDM]+)\\s[Тт][Уу][Рр]([\\.:])?$",
|
|
4
|
-
"question": "З[Аа][Пп][Ии][Тт][Аа][Нн][Нн][Яя]\\s?[№N]?([0-9\\s]*)\\s?([\\.:]|\n|\r\n|$)",
|
|
4
|
+
"question": "З[Аа][Пп][Ии][Тт][Аа][Нн][Нн][Яя]\\s?[№N]?(?P<number>[0-9\\s]*)\\s?([\\.:]|\n|\r\n|$)",
|
|
5
5
|
"handout": "Р[Оо][Зз][Дд][Аа][Тт][Кк][Оо][Вв][Ии][Йй][\\s\\s][Мм][Аа][Тт][Ее][Рр][Іі][Аа][Лл][\\.:]",
|
|
6
6
|
"answer": "В[Іі][Дд][Пп][Оо][Вв][Іі][Дд][Ьь]?\\s?[№N]?([0-9]+)?\\s?[\\.:]",
|
|
7
7
|
"zachet": "З[Аа][Лл][Іі][Кк]\\s?[\\.:]",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"tour": "^(#\\s+)?([0-9]+)\\s?[-–—]\\s?[Тт][Уу][Рр]\\s*$",
|
|
3
|
-
"question": "([0-9]+)\\s?[-–—]\\s?\\sС[Аа][Вв][Оо][Лл]([\\.:]|\n|\r\n|$)",
|
|
3
|
+
"question": "(?P<number>[0-9]+)\\s?[-–—]\\s?\\sС[Аа][Вв][Оо][Лл]([\\.:]|\n|\r\n|$)",
|
|
4
4
|
"answer": "Ж[Аа][Вв][Оо][Бб]\\s?[№N]?([0-9]+)?\\s?[\\.:]",
|
|
5
5
|
"handout": "Т[Аа][Рр][Ққ][Аа][Тт][Мм][Аа][\\s\\s][Мм][Аа][Тт][Ее][Рр][Ии][Аа][Лл][\\.:]",
|
|
6
6
|
"zachet": "Қ[Аа][Бб][Уу][Лл]\\s?[\\.:]",
|
chgksuite/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.26.0b10"
|
|
1
|
+
__version__ = "0.26.1"
|