abstractcore 2.5.0__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/apps/__main__.py +8 -1
  3. abstractcore/apps/deepsearch.py +644 -0
  4. abstractcore/apps/intent.py +614 -0
  5. abstractcore/architectures/detection.py +250 -4
  6. abstractcore/assets/architecture_formats.json +14 -1
  7. abstractcore/assets/model_capabilities.json +583 -44
  8. abstractcore/compression/__init__.py +29 -0
  9. abstractcore/compression/analytics.py +420 -0
  10. abstractcore/compression/cache.py +250 -0
  11. abstractcore/compression/config.py +279 -0
  12. abstractcore/compression/exceptions.py +30 -0
  13. abstractcore/compression/glyph_processor.py +381 -0
  14. abstractcore/compression/optimizer.py +388 -0
  15. abstractcore/compression/orchestrator.py +380 -0
  16. abstractcore/compression/pil_text_renderer.py +818 -0
  17. abstractcore/compression/quality.py +226 -0
  18. abstractcore/compression/text_formatter.py +666 -0
  19. abstractcore/compression/vision_compressor.py +371 -0
  20. abstractcore/config/main.py +66 -1
  21. abstractcore/config/manager.py +111 -5
  22. abstractcore/core/session.py +105 -5
  23. abstractcore/events/__init__.py +1 -1
  24. abstractcore/media/auto_handler.py +312 -18
  25. abstractcore/media/handlers/local_handler.py +14 -2
  26. abstractcore/media/handlers/openai_handler.py +62 -3
  27. abstractcore/media/processors/__init__.py +11 -1
  28. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  29. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  30. abstractcore/media/processors/image_processor.py +7 -1
  31. abstractcore/media/processors/text_processor.py +18 -3
  32. abstractcore/media/types.py +164 -7
  33. abstractcore/processing/__init__.py +5 -1
  34. abstractcore/processing/basic_deepsearch.py +2173 -0
  35. abstractcore/processing/basic_intent.py +690 -0
  36. abstractcore/providers/__init__.py +18 -0
  37. abstractcore/providers/anthropic_provider.py +29 -2
  38. abstractcore/providers/base.py +279 -6
  39. abstractcore/providers/huggingface_provider.py +658 -27
  40. abstractcore/providers/lmstudio_provider.py +52 -2
  41. abstractcore/providers/mlx_provider.py +103 -4
  42. abstractcore/providers/model_capabilities.py +352 -0
  43. abstractcore/providers/ollama_provider.py +44 -6
  44. abstractcore/providers/openai_provider.py +29 -2
  45. abstractcore/providers/registry.py +91 -19
  46. abstractcore/server/app.py +91 -81
  47. abstractcore/structured/handler.py +161 -1
  48. abstractcore/tools/common_tools.py +98 -3
  49. abstractcore/utils/__init__.py +4 -1
  50. abstractcore/utils/cli.py +114 -1
  51. abstractcore/utils/trace_export.py +287 -0
  52. abstractcore/utils/version.py +1 -1
  53. abstractcore/utils/vlm_token_calculator.py +655 -0
  54. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
  55. abstractcore-2.5.3.dist-info/RECORD +107 -0
  56. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
  57. abstractcore-2.5.0.dist-info/RECORD +0 -86
  58. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  59. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  60. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,666 @@
1
+ """
2
+ Text formatter for Glyph compression with markdown-like formatting support.
3
+
4
+ This module provides text preprocessing to improve readability in compressed images
5
+ by handling newlines, markdown formatting, and headers appropriately.
6
+
7
+ The formatter converts markdown-like syntax to ReportLab-compatible rich text
8
+ with proper bold and italic font rendering.
9
+ """
10
+
11
+ import re
12
+ from typing import Dict, Any, Optional, List, Tuple
13
+ from dataclasses import dataclass
14
+
15
+ from ..utils.structured_logging import get_logger
16
+
17
+
18
@dataclass
class TextSegment:
    """A run of text together with the formatting flags attached to it."""

    # The characters that make up this run.
    text: str
    # Inline styling flags; both default to off (plain text).
    is_bold: bool = False
    is_italic: bool = False
    # Header marker and its depth; the depth stays 0 unless a header sets it.
    is_header: bool = False
    header_level: int = 0
26
+
27
+
28
@dataclass
class FormattingConfig:
    """Configuration for text formatting options."""

    # Newline handling
    single_newline_to_space: bool = True        # A single newline becomes 1 space
    double_newline_to_two_spaces: bool = True   # Two newlines become 2 spaces
    triple_newline_to_break: bool = True        # Three or more newlines become one line break

    # Markdown formatting
    bold_formatting: bool = True                # Emit **text** as a bold segment
    italic_formatting: bool = True              # Emit *text* as an italic segment

    # Header formatting
    header_formatting: bool = True              # Treat lines starting with '#' as headers
    header_bold_caps: bool = True               # Upper-case header text

    # Header numbering style labels, one per header level.
    # NOTE(review): nothing in this module reads these values; the numbering
    # scheme is hard-coded in TextFormatter._get_header_number - confirm
    # whether another module consumes them.
    h1_style: str = "A"
    h2_style: str = "A.1"
    h3_style: str = "A.1.a"
    h4_style: str = "A.1.a.i"
    h5_style: str = "A.1.a.i.1"

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary for caching.

        Returns:
            Mapping of every configuration field name to its current value.
        """
        return {
            'single_newline_to_space': self.single_newline_to_space,
            'double_newline_to_two_spaces': self.double_newline_to_two_spaces,
            'triple_newline_to_break': self.triple_newline_to_break,
            'bold_formatting': self.bold_formatting,
            'italic_formatting': self.italic_formatting,
            'header_formatting': self.header_formatting,
            'header_bold_caps': self.header_bold_caps,
            'h1_style': self.h1_style,
            'h2_style': self.h2_style,
            'h3_style': self.h3_style,
            'h4_style': self.h4_style,
            # Previously missing: without it two configs that differ only in
            # h5_style produced identical dictionaries.
            'h5_style': self.h5_style,
        }
67
+
68
+
69
+ class TextFormatter:
70
+ """
71
+ Text formatter for improving readability in Glyph-compressed images.
72
+
73
+ Handles markdown-like formatting, newline processing, and header conversion
74
+ to make text more readable when rendered as images.
75
+ """
76
+
77
def __init__(self, config: Optional[FormattingConfig] = None):
    """
    Set up the formatter's configuration, logger and header counters.

    Args:
        config: Formatting configuration; a default FormattingConfig is
            created when none (or a falsy value) is supplied
    """
    self.config = config if config else FormattingConfig()
    self.logger = get_logger(type(self).__name__)

    # One running counter per header level, used for numbering
    self._header_counters = dict.fromkeys(('h1', 'h2', 'h3', 'h4', 'h5'), 0)

    self.logger.debug("TextFormatter initialized", config=self.config.to_dict())
97
+
98
def format_text(self, text: str) -> List[TextSegment]:
    """
    Apply formatting transformations to text and return structured segments.

    Empty input yields a single empty segment. Texts longer than 50,000
    characters that contain no header line and no ``*`` marker take a fast
    path: only newline processing is applied and one plain segment is
    returned. All other texts are parsed into segments first and newline
    processing is then applied segment by segment.

    Args:
        text: Raw text to format

    Returns:
        List of TextSegment objects with formatting information
    """
    import time
    start_time = time.time()

    if not text:
        return [TextSegment(text="")]

    # Reset header counters for each new text. Done before the fast path so
    # get_formatting_summary() never reports counters left by an earlier call.
    self._reset_counters()

    # Header detection looks at every line: checking only a prefix of the
    # text let large inputs whose first header appears later fall into the
    # fast path and silently skip header formatting.
    has_headers = any(line.lstrip().startswith('#') for line in text.split('\n'))
    has_bold_markers = '**' in text
    # A lone '*' (not adjacent to another '*') is an italic marker candidate,
    # even when the same text also contains '**'.
    has_italic_markers = re.search(r'(?<!\*)\*(?!\*)', text) is not None

    self.logger.debug("Starting text formatting",
                      original_length=len(text),
                      has_newlines='\n' in text,
                      has_bold_markers=has_bold_markers,
                      has_italic_markers=has_italic_markers,
                      has_headers=has_headers)

    # Performance optimization: For large files with no formatting, skip complex processing
    if len(text) > 50000 and not has_headers and not has_bold_markers and not has_italic_markers:
        self.logger.debug("Large file with no formatting detected - using fast path")
        # Just process newlines and return as single segment
        processed_text = self._process_newlines(text)
        return [TextSegment(text=processed_text)]

    # Step 1: Parse into segments with formatting (before newline processing)
    step1_start = time.time()
    self.logger.debug("Step 1: Starting _parse_formatted_text")
    segments = self._parse_formatted_text(text)
    step1_time = time.time() - step1_start
    self.logger.debug(f"Step 1: _parse_formatted_text completed in {step1_time:.3f}s, segments={len(segments)}")

    # Step 2: Apply newline processing to the final segments
    step2_start = time.time()
    self.logger.debug("Step 2: Starting _apply_newline_processing_to_segments")
    segments = self._apply_newline_processing_to_segments(segments)
    step2_time = time.time() - step2_start
    self.logger.debug(f"Step 2: _apply_newline_processing_to_segments completed in {step2_time:.3f}s")

    total_time = time.time() - start_time
    self.logger.debug("Text formatting completed",
                      original_length=len(text),
                      segments_count=len(segments),
                      total_formatted_length=sum(len(s.text) for s in segments),
                      total_time_seconds=f"{total_time:.3f}")

    return segments
158
+
159
def format_text_to_string(self, text: str) -> str:
    """
    Format *text* and flatten the resulting segments into one plain string.

    Formatting flags on the segments are discarded; only their text is kept,
    concatenated in order with no separator.

    Args:
        text: Raw text to format

    Returns:
        Concatenated text of all segments produced by format_text
    """
    pieces = [part.text for part in self.format_text(text)]
    return ''.join(pieces)
171
+
172
def _reset_counters(self):
    """Replace the header counters with a fresh mapping where every level (h1-h5) is 0."""
    self._header_counters = dict.fromkeys(('h1', 'h2', 'h3', 'h4', 'h5'), 0)
175
+
176
def _parse_formatted_text(self, text: str) -> List[TextSegment]:
    """
    Parse text with markdown formatting into structured segments.

    The text is split on newline characters and handled one line at a time:
    a non-blank line whose first non-whitespace character is '#' is turned
    into a header segment (when header formatting is enabled), any other
    non-blank line goes through inline bold/italic parsing, and a blank line
    contributes an empty segment. A separate newline segment is appended
    after each line except the last one and except '#' lines when header
    formatting is enabled.

    NOTE(review): the block nesting below was reconstructed from a listing
    that had lost its indentation. In particular, `continue` is assumed to
    sit inside `if header_segment:` - confirm against the packaged file.

    Args:
        text: Text with markdown formatting

    Returns:
        List of TextSegment objects
    """
    import time
    start_time = time.time()

    segments = []

    # Split text by lines first to handle headers
    lines = text.split('\n')
    total_lines = len(lines)

    self.logger.debug(f"_parse_formatted_text: Processing {total_lines} lines")

    for line_idx, line in enumerate(lines):
        # Progress logging every 1000 lines for large files
        if line_idx > 0 and line_idx % 1000 == 0:
            elapsed = time.time() - start_time
            self.logger.debug(f"_parse_formatted_text: Progress {line_idx}/{total_lines} lines ({line_idx/total_lines*100:.1f}%) in {elapsed:.2f}s")
        if line.strip():
            # Process headers first
            if self.config.header_formatting and line.strip().startswith('#'):
                header_segment = self._process_header_line_to_segment(line)
                if header_segment:
                    segments.append(header_segment)
                    # NEVER add line break after header (rule #9)
                    continue

            # Process inline formatting (bold/italic) for non-header lines.
            # A '#' line for which no header segment was produced (no text
            # after the hashes) also ends up here.
            line_segments = self._parse_inline_formatting(line)
            segments.extend(line_segments)
        else:
            # Empty line
            segments.append(TextSegment(text=""))

        # Add line break after each line (except the last one); '#' lines are
        # skipped as well while header formatting is enabled.
        if line_idx < len(lines) - 1 and not (self.config.header_formatting and line.strip().startswith('#')):
            segments.append(TextSegment(text="\n"))

    return segments
223
+
224
def _apply_newline_processing_to_segments(self, segments: List[TextSegment]) -> List[TextSegment]:
    """
    Run the newline rules over an already-parsed list of segments.

    A segment whose text is exactly one newline is swapped for a one-space
    segment when single_newline_to_space is enabled, otherwise it is passed
    through untouched. Every other segment is rebuilt with its text sent
    through _process_newlines and its formatting flags carried over.

    Args:
        segments: List of TextSegment objects

    Returns:
        New list of TextSegment objects with newline processing applied
    """
    def convert(piece: TextSegment) -> TextSegment:
        if piece.text != "\n":
            # Ordinary content: rewrite the text, keep the flags
            return TextSegment(
                text=self._process_newlines(piece.text),
                is_bold=piece.is_bold,
                is_italic=piece.is_italic,
                is_header=piece.is_header,
                header_level=piece.header_level
            )
        if self.config.single_newline_to_space:
            return TextSegment(text=" ")
        return piece

    return [convert(piece) for piece in segments]
255
+
256
def _process_newlines(self, text: str) -> str:
    """
    Process newlines within text content.

    Each maximal run of consecutive newlines is rewritten in one pass:
        1) three or more newlines become a single line break
           (when triple_newline_to_break is enabled)
        2) otherwise each pair of newlines becomes 2 spaces
           (when double_newline_to_two_spaces is enabled)
        3) any remaining newline becomes 1 space
           (when single_newline_to_space is enabled)

    Literal backslash-n sequences (the two characters "\\" and "n") are
    converted to real newlines first, so they follow the same rules.

    Args:
        text: Text whose newlines should be normalised

    Returns:
        Text with newline runs rewritten according to the configuration
    """
    config = self.config

    def collapse(run) -> str:
        # Rewrite one run of newlines. Working per run removes the need for
        # an in-band placeholder, which used to corrupt any input that
        # happened to contain the placeholder string itself.
        length = len(run.group(0))
        if length >= 3 and config.triple_newline_to_break:
            return '\n'
        if config.double_newline_to_two_spaces:
            pairs, leftover = divmod(length, 2)
        else:
            pairs, leftover = 0, length
        single = ' ' if config.single_newline_to_space else '\n'
        return '  ' * pairs + single * leftover

    # First, convert literal \n sequences to actual newlines
    text = text.replace('\\n', '\n')

    return re.sub(r'\n+', collapse, text)
289
+
290
def _parse_inline_formatting(self, text: str) -> List[TextSegment]:
    """
    Parse inline formatting (bold, italic) in a line of text.

    The line is scanned left to right with an index. At each position the
    scanner first tries to read a ``**bold**`` span, then an ``*italic*``
    span, and otherwise collects plain characters up to the next '*'.
    A '*' that starts neither span is emitted as its own one-character
    plain segment so the scan always advances.

    NOTE(review): the block nesting below was reconstructed from a listing
    that had lost its indentation. The `i = ...` / `continue` pairs are
    assumed to sit inside the innermost `if` (so a disabled bold/italic
    option leaves the text in place rather than dropping it) - confirm
    against the packaged file.

    Args:
        text: Line of text with potential formatting

    Returns:
        List of TextSegment objects for this line (empty for empty input)
    """
    segments = []

    if not text:
        return segments

    # Performance optimization: Skip inline parsing if no formatting markers
    if '**' not in text and '*' not in text:
        return [TextSegment(text=text)]

    # Process text sequentially to handle formatting correctly
    i = 0
    while i < len(text):
        # Check for bold formatting **text**
        if i < len(text) - 3 and text[i:i+2] == '**':
            # Find the closing **
            end_pos = text.find('**', i + 2)
            if end_pos != -1 and end_pos > i + 2:  # Must have content between
                # Found bold text
                bold_content = text[i+2:end_pos]
                if bold_content and self.config.bold_formatting:
                    segments.append(TextSegment(text=bold_content, is_bold=True))
                    # Resume scanning just past the closing marker
                    i = end_pos + 2
                    continue

        # Check for italic formatting *text* (but not part of **)
        if (i < len(text) - 2 and text[i] == '*' and
            (i == 0 or text[i-1:i+1] != '**') and  # Not part of **
            (i >= len(text) - 2 or text[i:i+2] != '**')):  # Not start of **

            # Find the closing *
            end_pos = i + 1
            while end_pos < len(text) and text[end_pos] != '*':
                end_pos += 1

            if end_pos < len(text) and end_pos > i + 1:  # Must have content between
                # Make sure this isn't part of **
                if end_pos >= len(text) - 1 or text[end_pos:end_pos+2] != '**':
                    italic_content = text[i+1:end_pos]
                    if italic_content and self.config.italic_formatting:
                        segments.append(TextSegment(text=italic_content, is_italic=True))
                        # Resume scanning just past the closing marker
                        i = end_pos + 1
                        continue

        # Regular character - collect until next formatting marker or advance by 1
        start_pos = i
        while i < len(text) and text[i] != '*':
            i += 1

        if i > start_pos:
            plain_text = text[start_pos:i]
            if plain_text:  # Only add non-empty segments
                segments.append(TextSegment(text=plain_text))
        else:
            # If we didn't advance, we hit a * that didn't match formatting
            # Add the single character and advance to prevent infinite loop
            segments.append(TextSegment(text=text[i]))
            i += 1

    return segments
359
+
360
def _process_header_line_to_segment(self, line: str) -> Optional[TextSegment]:
    """
    Turn a markdown header line into a bold header TextSegment.

    The header level is the number of leading '#' characters, capped at 5
    (extra hashes stay part of the header text). The counter for that level
    is incremented and the counters of all deeper levels are reset to 0.
    Levels 2-5 are prefixed with the number from _get_header_number; level 1
    carries no number. Inline markdown markers are stripped from the header
    text, which is upper-cased when header_bold_caps is enabled.

    Args:
        line: Line starting with one or more '#'

    Returns:
        TextSegment with header formatting, or None when the line has no
        leading '#' or no text after the hashes
    """
    stripped = line.strip()

    hash_count = len(stripped) - len(stripped.lstrip('#'))
    if hash_count == 0:
        return None

    depth = min(hash_count, 5)
    content = stripped[depth:].strip()
    if not content:
        return None

    key = f'h{depth}'
    self._header_counters[key] += 1
    # A new header at this depth restarts numbering of everything below it
    for deeper in range(depth + 1, 6):
        self._header_counters[f'h{deeper}'] = 0

    if depth == 1:
        # H1 headers are not numbered
        clean_content = self._strip_markdown_formatting(content)
        formatted_content = f"{clean_content.upper() if self.config.header_bold_caps else clean_content}"
    else:
        number = self._get_header_number(key, self._header_counters[key])
        clean_content = self._strip_markdown_formatting(content)
        formatted_content = f"{number} {clean_content.upper() if self.config.header_bold_caps else clean_content}"

    return TextSegment(text=formatted_content, is_bold=True, is_header=True, header_level=depth)
441
+
442
def _strip_markdown_formatting(self, text: str) -> str:
    """Return *text* with ``**bold**`` and ``*italic*`` markers removed and their content kept."""
    # Bold first, so the italic pass only ever sees single asterisks
    without_bold = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    return re.sub(r'(?<!\*)\*([^*]+?)\*(?!\*)', r'\1', without_bold)
449
+
450
+
451
def _process_bold(self, text: str) -> str:
    """
    Replace each ``**text**`` span with its content in upper case.

    The asterisks are removed; upper-casing stands in for bold in plain
    text output. Matching is non-greedy, so adjacent spans are handled
    independently.
    """
    return re.sub(r'\*\*(.*?)\*\*', lambda found: found.group(1).upper(), text)
465
+
466
def _process_italic(self, text: str) -> str:
    """
    Remove the asterisks around each ``*text*`` span, leaving the content as-is.

    Only single asterisks are matched; ``**text**`` spans are left untouched.
    """
    # The look-arounds reject asterisks that belong to a '**' pair
    single_star_span = r'(?<!\*)\*([^*]+?)\*(?!\*)'
    return re.sub(single_star_span, r'\1', text)
481
+
482
def _process_headers(self, text: str) -> str:
    """
    Apply _process_header_line to every line of *text*.

    The text is split on newlines, each line is passed through
    _process_header_line in order, and the results are joined back with
    newlines.
    """
    return '\n'.join(self._process_header_line(row) for row in text.split('\n'))
498
+
499
def _process_header_line(self, line: str) -> str:
    """
    Process a single line for header formatting and return plain text.

    Delegates to _process_header_line_to_segment so the string-based and
    segment-based paths share one numbering scheme. The previous
    implementation appended ")" to the value of _get_header_number, which
    already ends in "." (and is empty for H1), producing output such as
    "A.) TITLE" and ") TITLE"; it also treated four or more hashes as H3.

    Args:
        line: A single line of text, possibly a markdown header

    Returns:
        The formatted header text, or the line unchanged when it is not a
        header (or is a header marker with no text after it)
    """
    segment = self._process_header_line_to_segment(line)
    return segment.text if segment is not None else line
537
+
538
def _get_header_number(self, level: str, count: int) -> str:
    """
    Get the appropriate header number/letter based on level and count.

    Hierarchical format:
        H1: no numbering (empty string)
        H2: A. B. C. ... (after Z: AA. BB. CC. ...)
        H3: A.1. A.2. A.3. ...
        H4: A.1.a. A.1.b. A.1.c. ...
        H5: A.1.a.i. A.1.a.ii. A.1.a.iii. ...

    Ancestor positions are read from the current header counters. An
    ancestor level that was skipped in the document (counter still 0) is
    numbered as if it were the first entry, so the result is always
    well-formed; previously a zero counter produced characters such as
    "@" or "`" in the label.

    Args:
        level: Header level ('h1', 'h2', 'h3', 'h4', 'h5')
        count: Current count for this level

    Returns:
        Formatted number/letter (e.g., 'A.', 'A.1.', 'A.1.a.'); for an
        unrecognised level, the count as a plain string
    """
    def letters(position: int, first: str) -> str:
        # 1 -> first letter ... 26 -> last letter, then the letter is
        # repeated: 27 -> "AA", 28 -> "BB", ... Positions below 1 are
        # treated as 1.
        index = max(position, 1) - 1
        return chr(ord(first) + index % 26) * (index // 26 + 1)

    if level == 'h1':
        # H1: No numbering
        return ""

    if level == 'h2':
        # H2: A. B. C. ...
        return f"{letters(count, 'A')}."

    if level not in ('h3', 'h4', 'h5'):
        return str(count)

    counters = self._header_counters
    h2_letter = letters(counters['h2'], 'A')

    if level == 'h3':
        # H3: A.1. A.2. A.3. ...
        return f"{h2_letter}.{count}."

    h3_count = max(counters['h3'], 1)

    if level == 'h4':
        # H4: A.1.a. A.1.b. A.1.c. ...
        return f"{h2_letter}.{h3_count}.{letters(count, 'a')}."

    # H5: A.1.a.i. A.1.a.ii. A.1.a.iii. ...
    h4_letter = letters(counters['h4'], 'a')
    h5_roman = self._int_to_roman_lower(count)
    return f"{h2_letter}.{h3_count}.{h4_letter}.{h5_roman}."
622
+
623
def _int_to_roman_lower(self, num: int) -> str:
    """Convert integer to lowercase Roman numeral (empty string for 0)."""
    # Largest value first, including the subtractive pairs (cm, cd, xc, ...)
    numerals = (
        (1000, 'm'), (900, 'cm'), (500, 'd'), (400, 'cd'),
        (100, 'c'), (90, 'xc'), (50, 'l'), (40, 'xl'),
        (10, 'x'), (9, 'ix'), (5, 'v'), (4, 'iv'), (1, 'i'),
    )

    pieces = []
    remaining = num
    for value, glyph in numerals:
        repeat, remaining = divmod(remaining, value)
        pieces.append(glyph * repeat)
    return "".join(pieces)
635
+
636
def get_formatting_summary(self) -> Dict[str, Any]:
    """
    Report the active configuration and current header counters.

    Returns:
        Dictionary with the configuration as a dict ('config'), a copy of
        the header counters ('header_counters') and a fixed
        'formatter_version' string
    """
    summary: Dict[str, Any] = {}
    summary['config'] = self.config.to_dict()
    # Copy so callers cannot mutate the live counters
    summary['header_counters'] = self._header_counters.copy()
    summary['formatter_version'] = '1.0'
    return summary
643
+
644
+
645
def create_default_formatter() -> TextFormatter:
    """Build a TextFormatter around a freshly constructed default FormattingConfig."""
    default_config = FormattingConfig()
    return TextFormatter(default_config)
648
+
649
+
650
def create_minimal_formatter() -> TextFormatter:
    """Build a TextFormatter with bold, italic and header formatting switched off."""
    # Newline options keep their defaults; only markdown handling is disabled
    minimal = FormattingConfig(
        bold_formatting=False,
        italic_formatting=False,
        header_formatting=False,
    )
    return TextFormatter(minimal)
657
+
658
+
659
def create_headers_only_formatter() -> TextFormatter:
    """
    Create a TextFormatter that only processes headers.

    Bold/italic handling and all newline rewriting are disabled; header
    formatting keeps its default (enabled).

    Returns:
        TextFormatter configured for header processing only
    """
    config = FormattingConfig()
    config.bold_formatting = False
    config.italic_formatting = False
    # The previous code assigned `consecutive_newlines_to_break` and
    # `single_newline_to_spaces`, which are not FormattingConfig fields, so
    # the assignments were silently ignored and newline processing stayed on.
    # NOTE(review): the old "consecutive" flag is mapped to both the double
    # and triple newline rules - confirm that is the intended scope.
    config.single_newline_to_space = False
    config.double_newline_to_two_spaces = False
    config.triple_newline_to_break = False
    return TextFormatter(config)