abstractcore 2.5.2__py3-none-any.whl → 2.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +12 -0
- abstractcore/architectures/detection.py +250 -4
- abstractcore/assets/architecture_formats.json +14 -1
- abstractcore/assets/model_capabilities.json +533 -10
- abstractcore/compression/__init__.py +29 -0
- abstractcore/compression/analytics.py +420 -0
- abstractcore/compression/cache.py +250 -0
- abstractcore/compression/config.py +279 -0
- abstractcore/compression/exceptions.py +30 -0
- abstractcore/compression/glyph_processor.py +381 -0
- abstractcore/compression/optimizer.py +388 -0
- abstractcore/compression/orchestrator.py +380 -0
- abstractcore/compression/pil_text_renderer.py +818 -0
- abstractcore/compression/quality.py +226 -0
- abstractcore/compression/text_formatter.py +666 -0
- abstractcore/compression/vision_compressor.py +371 -0
- abstractcore/config/main.py +64 -0
- abstractcore/config/manager.py +100 -5
- abstractcore/core/session.py +61 -6
- abstractcore/events/__init__.py +1 -1
- abstractcore/media/auto_handler.py +312 -18
- abstractcore/media/handlers/local_handler.py +14 -2
- abstractcore/media/handlers/openai_handler.py +62 -3
- abstractcore/media/processors/__init__.py +11 -1
- abstractcore/media/processors/direct_pdf_processor.py +210 -0
- abstractcore/media/processors/glyph_pdf_processor.py +227 -0
- abstractcore/media/processors/image_processor.py +7 -1
- abstractcore/media/processors/text_processor.py +18 -3
- abstractcore/media/types.py +164 -7
- abstractcore/providers/__init__.py +18 -0
- abstractcore/providers/anthropic_provider.py +28 -2
- abstractcore/providers/base.py +278 -6
- abstractcore/providers/huggingface_provider.py +563 -23
- abstractcore/providers/lmstudio_provider.py +38 -2
- abstractcore/providers/mlx_provider.py +27 -2
- abstractcore/providers/model_capabilities.py +352 -0
- abstractcore/providers/ollama_provider.py +38 -4
- abstractcore/providers/openai_provider.py +28 -2
- abstractcore/providers/registry.py +85 -13
- abstractcore/server/app.py +91 -81
- abstractcore/utils/__init__.py +4 -1
- abstractcore/utils/trace_export.py +287 -0
- abstractcore/utils/version.py +1 -1
- abstractcore/utils/vlm_token_calculator.py +655 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/METADATA +107 -6
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/RECORD +50 -33
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,666 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text formatter for Glyph compression with markdown-like formatting support.
|
|
3
|
+
|
|
4
|
+
This module provides text preprocessing to improve readability in compressed images
|
|
5
|
+
by handling newlines, markdown formatting, and headers appropriately.
|
|
6
|
+
|
|
7
|
+
The formatter converts markdown-like syntax to ReportLab-compatible rich text
|
|
8
|
+
with proper bold and italic font rendering.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import Dict, Any, Optional, List, Tuple
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
from ..utils.structured_logging import get_logger
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class TextSegment:
    """A run of text together with the styling flags that apply to it.

    Field order is part of the interface: instances may be built positionally.
    """

    # The characters of this run.
    text: str
    # Styling flags; all off for plain text.
    is_bold: bool = False
    is_italic: bool = False
    is_header: bool = False
    # 0 for non-header text, otherwise the header depth (1-5 for H1-H5).
    header_level: int = 0
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class FormattingConfig:
    """Configuration switches for text formatting.

    Every attribute is a plain value, so the whole configuration can be
    flattened into a dictionary with :meth:`to_dict`.
    """

    # Newline handling
    single_newline_to_space: bool = True  # a lone newline becomes 1 space
    double_newline_to_two_spaces: bool = True  # two newlines become 2 spaces
    triple_newline_to_break: bool = True  # three or more newlines become one line break

    # Markdown formatting
    bold_formatting: bool = True  # **text** is emitted as a bold segment
    italic_formatting: bool = True  # *text* is emitted as an italic segment

    # Header formatting
    header_formatting: bool = True  # treat lines starting with '#' as headers
    header_bold_caps: bool = True  # upper-case header text

    # Header numbering style labels, one per nesting depth.
    # NOTE(review): TextFormatter in this file hard-codes its numbering and
    # never reads these fields - confirm whether another module consumes them.
    h1_style: str = "A"
    h2_style: str = "A.1"
    h3_style: str = "A.1.a"
    h4_style: str = "A.1.a.i"
    h5_style: str = "A.1.a.i.1"

    def to_dict(self) -> Dict[str, Any]:
        """Return every configuration field as a dictionary (used for caching).

        Built from the dataclass field list so that no field can be left out:
        the previous hand-written mapping omitted ``h5_style``. Keys appear in
        field-declaration order.
        """
        # Function-scope import: the module only imports ``dataclass`` itself.
        from dataclasses import asdict

        return asdict(self)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class TextFormatter:
    """
    Text formatter for improving readability in Glyph-compressed images.

    Turns raw text containing markdown-like markers (``#`` headers, ``**bold**``,
    ``*italic*``) and newlines into a flat list of ``TextSegment`` objects.
    """

    # Counter keys, shallowest header level first.
    _HEADER_LEVELS = ('h1', 'h2', 'h3', 'h4', 'h5')

    # Compiled once; shared by the marker-stripping helpers.
    _BOLD_RE = re.compile(r'\*\*(.*?)\*\*')
    _ITALIC_RE = re.compile(r'(?<!\*)\*([^*]+?)\*(?!\*)')

    def __init__(self, config: Optional[FormattingConfig] = None):
        """
        Initialize text formatter.

        Args:
            config: Formatting configuration; a default ``FormattingConfig`` is
                created when omitted.
        """
        self.config = config or FormattingConfig()
        self.logger = get_logger(self.__class__.__name__)

        # Header counters for numbering
        self._reset_counters()

        self.logger.debug("TextFormatter initialized", config=self.config.to_dict())

    def format_text(self, text: str) -> List[TextSegment]:
        """
        Apply formatting transformations to text and return structured segments.

        Large inputs (more than 50000 characters) that contain no header line
        and no ``*`` marker skip segment parsing and are returned as a single
        newline-processed segment.

        Args:
            text: Raw text to format

        Returns:
            List of TextSegment objects with formatting information. Empty input
            yields a single empty segment.
        """
        import time  # function-scope import: the module has no file-level one

        started = time.perf_counter()

        if not text:
            return [TextSegment(text="")]

        # Every line is inspected: a header may first appear anywhere in the
        # document, and the fast path below must not be taken when one exists.
        has_headers = any(line.lstrip().startswith('#') for line in text.split('\n'))
        has_bold_markers = '**' in text
        has_italic_markers = '*' in text and not has_bold_markers

        self.logger.debug("Starting text formatting",
                          original_length=len(text),
                          has_newlines='\n' in text,
                          has_bold_markers=has_bold_markers,
                          has_italic_markers=has_italic_markers,
                          has_headers=has_headers)

        # Performance optimization: large files with no formatting only need
        # newline processing.
        if len(text) > 50000 and not (has_headers or has_bold_markers or has_italic_markers):
            self.logger.debug("Large file with no formatting detected - using fast path")
            return [TextSegment(text=self._process_newlines(text))]

        # Header numbering restarts for every document.
        self._reset_counters()

        # Step 1: Parse into segments with formatting (before newline processing)
        step_started = time.perf_counter()
        self.logger.debug("Step 1: Starting _parse_formatted_text")
        segments = self._parse_formatted_text(text)
        step1_time = time.perf_counter() - step_started
        self.logger.debug(f"Step 1: _parse_formatted_text completed in {step1_time:.3f}s, segments={len(segments)}")

        # Step 2: Apply newline processing to the final segments
        step_started = time.perf_counter()
        self.logger.debug("Step 2: Starting _apply_newline_processing_to_segments")
        segments = self._apply_newline_processing_to_segments(segments)
        step2_time = time.perf_counter() - step_started
        self.logger.debug(f"Step 2: _apply_newline_processing_to_segments completed in {step2_time:.3f}s")

        total_time = time.perf_counter() - started
        self.logger.debug("Text formatting completed",
                          original_length=len(text),
                          segments_count=len(segments),
                          total_formatted_length=sum(len(s.text) for s in segments),
                          total_time_seconds=f"{total_time:.3f}")

        return segments

    def format_text_to_string(self, text: str) -> str:
        """
        Apply formatting and return as plain text (for backward compatibility).

        Args:
            text: Raw text to format

        Returns:
            The texts of all segments produced by ``format_text``, concatenated
        """
        return ''.join(segment.text for segment in self.format_text(text))

    def _reset_counters(self):
        """Reset header counters for new text."""
        self._header_counters = dict.fromkeys(self._HEADER_LEVELS, 0)

    def _parse_formatted_text(self, text: str) -> List[TextSegment]:
        """
        Parse text with markdown formatting into structured segments.

        The text is split on newlines. Each line becomes a header segment, a
        list of inline segments, or an empty segment for a blank line. A
        ``"\\n"`` segment separates consecutive lines, except after a header.

        Args:
            text: Text with markdown formatting

        Returns:
            List of TextSegment objects
        """
        import time  # function-scope import: the module has no file-level one

        started = time.perf_counter()

        segments: List[TextSegment] = []
        lines = text.split('\n')
        total_lines = len(lines)
        last_index = total_lines - 1
        headers_enabled = self.config.header_formatting

        self.logger.debug(f"_parse_formatted_text: Processing {total_lines} lines")

        for line_idx, line in enumerate(lines):
            # Progress logging every 1000 lines for large files
            if line_idx > 0 and line_idx % 1000 == 0:
                elapsed = time.perf_counter() - started
                self.logger.debug(f"_parse_formatted_text: Progress {line_idx}/{total_lines} lines ({line_idx/total_lines*100:.1f}%) in {elapsed:.2f}s")

            stripped = line.strip()

            header_segment = None
            if headers_enabled and stripped.startswith('#'):
                header_segment = self._process_header_line_to_segment(line)

            if header_segment is not None:
                # A header is never followed by a line-break segment.
                segments.append(header_segment)
                continue

            if stripped:
                # Ordinary text, including '#' lines that turned out not to be
                # valid headers (e.g. a bare "#").
                segments.extend(self._parse_inline_formatting(line))
            else:
                # Blank (or whitespace-only) line
                segments.append(TextSegment(text=""))

            # Separator after every non-header line except the last one
            if line_idx < last_index:
                segments.append(TextSegment(text="\n"))

        return segments

    def _apply_newline_processing_to_segments(self, segments: List[TextSegment]) -> List[TextSegment]:
        """
        Apply newline processing rules to segments.

        A ``"\\n"`` separator segment becomes a one-space segment when
        ``single_newline_to_space`` is set, so two consecutive separators
        render as two spaces. When ``triple_newline_to_break`` is set, a run of
        three or more separators (with the empty blank-line segments between
        them) collapses into a single ``"\\n"`` segment. All other segments
        have ``_process_newlines`` applied to their text and keep their flags.

        Args:
            segments: List of TextSegment objects

        Returns:
            List of TextSegment objects with newline processing applied
        """
        config = self.config
        processed: List[TextSegment] = []
        total = len(segments)
        index = 0

        while index < total:
            segment = segments[index]

            if segment.text == "\n":
                if config.triple_newline_to_break:
                    run_end, newline_count = self._scan_newline_run(segments, index)
                    if newline_count >= 3:
                        processed.append(TextSegment(text="\n"))
                        index = run_end
                        continue

                if config.single_newline_to_space:
                    processed.append(TextSegment(text=" "))
                else:
                    processed.append(segment)
            else:
                processed.append(TextSegment(
                    text=self._process_newlines(segment.text),
                    is_bold=segment.is_bold,
                    is_italic=segment.is_italic,
                    is_header=segment.is_header,
                    header_level=segment.header_level
                ))

            index += 1

        return processed

    @staticmethod
    def _scan_newline_run(segments, start):
        """Measure the run of separators beginning at ``segments[start]``.

        A run is a ``"\\n"`` segment followed by any number of
        (empty segment, ``"\\n"`` segment) pairs, i.e. consecutive blank lines.

        Returns:
            ``(end, count)``: the index just past the run and the number of
            ``"\\n"`` segments it contains.
        """
        total = len(segments)
        count = 1
        end = start + 1
        while end + 1 < total and segments[end].text == "" and segments[end + 1].text == "\n":
            count += 1
            end += 2
        return end, count

    def _process_newlines(self, text: str) -> str:
        r"""
        Process newlines within text content.

        With every rule enabled, a run of consecutive newlines is rewritten as:
            1) one newline      -> 1 space
            2) two newlines     -> 2 spaces
            3) three or more    -> 1 line break

        With ``triple_newline_to_break`` off, longer runs are handled pair by
        pair by the remaining rules. Newlines not covered by an enabled rule
        are left unchanged.

        Literal backslash-n sequences are converted to real newlines first and
        then follow the same rules.
        """
        config = self.config
        lone = ' ' if config.single_newline_to_space else '\n'

        def collapse(match):
            run_length = len(match.group(0))
            if run_length >= 3 and config.triple_newline_to_break:
                return '\n'
            if config.double_newline_to_two_spaces:
                pairs, leftover = divmod(run_length, 2)
                return '  ' * pairs + lone * leftover
            return lone * run_length

        # Each newline run is rewritten in one pass, so no in-band placeholder
        # is needed and no input text can be mistaken for one.
        return re.sub(r'\n+', collapse, text.replace('\\n', '\n'))

    def _parse_inline_formatting(self, text: str) -> List[TextSegment]:
        """
        Parse inline formatting (bold, italic) in a line of text.

        ``**text**`` becomes a bold segment and ``*text*`` an italic segment.
        When the corresponding config switch is off, the span is kept verbatim
        (markers included) as a plain segment rather than discarded. An
        unmatched ``*`` is emitted as its own plain segment.

        Args:
            text: Line of text with potential formatting

        Returns:
            List of TextSegment objects for this line (empty for empty input)
        """
        segments: List[TextSegment] = []

        if not text:
            return segments

        # Performance optimization: no marker character means no inline parsing
        if '*' not in text:
            return [TextSegment(text=text)]

        config = self.config
        length = len(text)
        pos = 0

        while pos < length:
            starts_double = text.startswith('**', pos)

            # Bold: **content** with at least one character of content
            if starts_double and pos < length - 3:
                close = text.find('**', pos + 2)
                if close > pos + 2:
                    if config.bold_formatting:
                        segments.append(TextSegment(text=text[pos + 2:close], is_bold=True))
                    else:
                        segments.append(TextSegment(text=text[pos:close + 2]))
                    pos = close + 2
                    continue

            # Italic: *content*, where neither asterisk is part of a '**' pair
            if (text[pos] == '*' and not starts_double and pos < length - 2
                    and (pos == 0 or text[pos - 1] != '*')):
                close = text.find('*', pos + 1)
                if close > pos + 1 and not text.startswith('**', close):
                    if config.italic_formatting:
                        segments.append(TextSegment(text=text[pos + 1:close], is_italic=True))
                    else:
                        segments.append(TextSegment(text=text[pos:close + 1]))
                    pos = close + 1
                    continue

            # Plain text up to the next marker character
            next_marker = text.find('*', pos)
            if next_marker == -1:
                next_marker = length

            if next_marker > pos:
                segments.append(TextSegment(text=text[pos:next_marker]))
                pos = next_marker
            else:
                # A '*' that matched no formatting: emit it and move on so the
                # loop always advances.
                segments.append(TextSegment(text=text[pos]))
                pos += 1

        return segments

    def _process_header_line_to_segment(self, line: str) -> Optional[TextSegment]:
        """
        Process a header line and return a TextSegment.

        The header level is the number of leading ``#`` characters, capped at 5.
        The matching counter is incremented and all deeper counters are reset.
        H1 text is emitted without a number; deeper levels are prefixed with
        the label from ``_get_header_number``. Inline markdown markers are
        stripped from the header text.

        Args:
            line: Line starting with one or more ``#`` characters

        Returns:
            TextSegment with header formatting, or None if the line has no
            leading ``#`` or no text after the markers
        """
        stripped = line.strip()

        hash_count = len(stripped) - len(stripped.lstrip('#'))
        if hash_count == 0:
            return None

        level = min(hash_count, len(self._HEADER_LEVELS))
        content = stripped[level:].strip()
        if not content:
            return None

        counters = self._header_counters
        key = self._HEADER_LEVELS[level - 1]
        counters[key] += 1
        # A new header starts a fresh numbering sequence for everything below it.
        for deeper in self._HEADER_LEVELS[level:]:
            counters[deeper] = 0

        title = self._strip_markdown_formatting(content)
        if self.config.header_bold_caps:
            title = title.upper()

        if level > 1:
            title = f"{self._get_header_number(key, counters[key])} {title}"

        return TextSegment(text=title, is_bold=True, is_header=True, header_level=level)

    def _strip_markdown_formatting(self, text: str) -> str:
        """Strip ``**bold**`` and ``*italic*`` markers from text, keeping the content."""
        without_bold = self._BOLD_RE.sub(r'\1', text)
        return self._ITALIC_RE.sub(r'\1', without_bold)

    def _process_bold(self, text: str) -> str:
        """
        Process bold markdown formatting (**text** → BOLD TEXT).

        The markers are removed and the enclosed text is upper-cased, which
        stands in for bold in plain-text output.
        """
        return self._BOLD_RE.sub(lambda match: match.group(1).upper(), text)

    def _process_italic(self, text: str) -> str:
        """
        Process italic markdown formatting (*text* → text).

        Only single-asterisk pairs are matched; the markers are removed and the
        enclosed text is left unchanged.
        """
        return self._ITALIC_RE.sub(r'\1', text)

    def _process_headers(self, text: str) -> str:
        """
        Process markdown headers line by line and return the joined text.

        Header lines are replaced by the same text that
        ``_process_header_line_to_segment`` produces; other lines are kept
        unchanged.
        """
        return '\n'.join(self._process_header_line(line) for line in text.split('\n'))

    def _process_header_line(self, line: str) -> str:
        """Process a single line for header formatting.

        Delegates to ``_process_header_line_to_segment`` so that the string and
        segment code paths share one numbering scheme.

        Returns:
            The formatted header text, or ``line`` unchanged when it is not a
            valid header.
        """
        segment = self._process_header_line_to_segment(line)
        return line if segment is None else segment.text

    @staticmethod
    def _alpha_label(count: int, first_letter: str) -> str:
        """Return the letter label for a 1-based ``count``.

        1..26 map to single letters starting at ``first_letter``; after that
        the letter is repeated (27 -> "AA", 28 -> "BB", ...). Counts below 1
        are clamped to the first letter so that a sub-header appearing before
        any parent header never yields a non-letter character.
        """
        repeats, offset = divmod(max(count, 1) - 1, 26)
        return chr(ord(first_letter) + offset) * (repeats + 1)

    def _get_header_number(self, level: str, count: int) -> str:
        """
        Get the appropriate header number/letter based on level and count.

        Hierarchical format:
        H1: No numbering
        H2: A. B. C. ...
        H3: A.1. A.2. A.3. ...
        H4: A.1.a. A.1.b. A.1.c. ...
        H5: A.1.a.i. A.1.a.ii. A.1.a.iii. ...

        Parent components are read from the current header counters.

        Args:
            level: Header level ('h1', 'h2', 'h3', 'h4', 'h5')
            count: Current count for this level

        Returns:
            Formatted number/letter (e.g., 'A.', 'A.1.', 'A.1.a.'); an empty
            string for 'h1'; ``str(count)`` for an unrecognised level
        """
        if level == 'h1':
            return ""

        if level == 'h2':
            return f"{self._alpha_label(count, 'A')}."

        if level not in ('h3', 'h4', 'h5'):
            return str(count)

        counters = self._header_counters
        section = self._alpha_label(counters['h2'], 'A')

        if level == 'h3':
            return f"{section}.{count}."

        if level == 'h4':
            return f"{section}.{counters['h3']}.{self._alpha_label(count, 'a')}."

        subsection = self._alpha_label(counters['h4'], 'a')
        return f"{section}.{counters['h3']}.{subsection}.{self._int_to_roman_lower(count)}."

    def _int_to_roman_lower(self, num: int) -> str:
        """Convert integer to lowercase Roman numeral (empty string for num < 1)."""
        numerals = (
            (1000, 'm'), (900, 'cm'), (500, 'd'), (400, 'cd'),
            (100, 'c'), (90, 'xc'), (50, 'l'), (40, 'xl'),
            (10, 'x'), (9, 'ix'), (5, 'v'), (4, 'iv'), (1, 'i'),
        )

        parts = []
        for value, literal in numerals:
            repeat, num = divmod(num, value) if num >= value else (0, num)
            if repeat:
                parts.append(literal * repeat)
        return ''.join(parts)

    def get_formatting_summary(self) -> Dict[str, Any]:
        """Get summary of formatting configuration and usage.

        Returns:
            Dictionary with the config as a dict, a copy of the current header
            counters, and the formatter version string.
        """
        return {
            'config': self.config.to_dict(),
            'header_counters': self._header_counters.copy(),
            'formatter_version': '1.0'
        }
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def create_default_formatter() -> TextFormatter:
    """Build a TextFormatter driven by an untouched, default FormattingConfig."""
    default_config = FormattingConfig()
    return TextFormatter(default_config)
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def create_minimal_formatter() -> TextFormatter:
    """Build a TextFormatter with bold, italic and header formatting switched off.

    The newline-handling switches keep their defaults.
    """
    newline_only = FormattingConfig(
        bold_formatting=False,
        italic_formatting=False,
        header_formatting=False,
    )
    return TextFormatter(newline_only)
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
def create_headers_only_formatter() -> TextFormatter:
    """Create a TextFormatter that only processes headers.

    Bold, italic and all three newline rules are switched off; header
    formatting keeps its default (enabled).
    """
    # Keyword construction is used deliberately: a misspelled field name raises
    # TypeError here, whereas assigning to a misspelled attribute on the
    # instance silently creates an unused attribute and leaves the real switch
    # at its default.
    config = FormattingConfig(
        bold_formatting=False,
        italic_formatting=False,
        single_newline_to_space=False,
        double_newline_to_two_spaces=False,
        triple_newline_to_break=False,
    )
    return TextFormatter(config)
|