deepagents-printshop 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. agents/content_editor/__init__.py +1 -0
  2. agents/content_editor/agent.py +279 -0
  3. agents/content_editor/content_reviewer.py +327 -0
  4. agents/content_editor/versioned_agent.py +455 -0
  5. agents/latex_specialist/__init__.py +1 -0
  6. agents/latex_specialist/agent.py +531 -0
  7. agents/latex_specialist/latex_analyzer.py +510 -0
  8. agents/latex_specialist/latex_optimizer.py +1192 -0
  9. agents/qa_orchestrator/__init__.py +1 -0
  10. agents/qa_orchestrator/agent.py +603 -0
  11. agents/qa_orchestrator/langgraph_workflow.py +733 -0
  12. agents/qa_orchestrator/pipeline_types.py +72 -0
  13. agents/qa_orchestrator/quality_gates.py +495 -0
  14. agents/qa_orchestrator/workflow_coordinator.py +139 -0
  15. agents/research_agent/__init__.py +1 -0
  16. agents/research_agent/agent.py +258 -0
  17. agents/research_agent/llm_report_generator.py +1023 -0
  18. agents/research_agent/report_generator.py +536 -0
  19. agents/visual_qa/__init__.py +1 -0
  20. agents/visual_qa/agent.py +410 -0
  21. deepagents_printshop-0.1.0.dist-info/METADATA +744 -0
  22. deepagents_printshop-0.1.0.dist-info/RECORD +37 -0
  23. deepagents_printshop-0.1.0.dist-info/WHEEL +4 -0
  24. deepagents_printshop-0.1.0.dist-info/entry_points.txt +2 -0
  25. deepagents_printshop-0.1.0.dist-info/licenses/LICENSE +86 -0
  26. tools/__init__.py +1 -0
  27. tools/change_tracker.py +419 -0
  28. tools/content_type_loader.py +171 -0
  29. tools/graph_generator.py +281 -0
  30. tools/latex_generator.py +374 -0
  31. tools/llm_latex_generator.py +678 -0
  32. tools/magazine_layout.py +462 -0
  33. tools/pattern_injector.py +250 -0
  34. tools/pattern_learner.py +477 -0
  35. tools/pdf_compiler.py +386 -0
  36. tools/version_manager.py +346 -0
  37. tools/visual_qa.py +799 -0
agents/latex_specialist/latex_optimizer.py
@@ -0,0 +1,1192 @@
1
+ """
2
+ LaTeX Optimizer - Milestone 3
3
+
4
+ Optimizes LaTeX document structure, typography, and formatting for professional quality.
5
+ """
6
+
7
+ import re
8
+ import os
9
+ import csv
10
+ from typing import Dict, List, Tuple, Optional
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+ import anthropic
14
+
15
+ # Add project root to path
16
+ project_root = Path(__file__).parent.parent.parent
17
+ if str(project_root) not in __import__('sys').path:
18
+ __import__('sys').path.insert(0, str(project_root))
19
+
20
+ from tools.latex_generator import LaTeXGenerator, DocumentConfig
21
+ from tools.content_type_loader import ContentTypeLoader
22
+
23
+
24
+ class LaTeXOptimizer:
25
+ """
26
+ Optimizes LaTeX documents for professional formatting and structure.
27
+
28
+ Features:
29
+ - Document structure optimization
30
+ - Typography enhancement
31
+ - Table and figure formatting improvement
32
+ - LaTeX best practices application
33
+ """
34
+
35
+ def __init__(self, content_source: str = "research_report"):
36
+ """Initialize the LaTeX optimizer.
37
+
38
+ Args:
39
+ content_source: Content source folder name (e.g., 'research_report', 'magazine')
40
+ """
41
+ self.content_source = content_source
42
+ self.content_dir = Path("artifacts/sample_content") / content_source
43
+ self.api_key = os.getenv('ANTHROPIC_API_KEY')
44
+ self.client = anthropic.Anthropic(api_key=self.api_key) if self.api_key else None
45
+ self.professional_packages = {
46
+ 'typography': [
47
+ '\\usepackage[T1]{fontenc}',
48
+ '\\usepackage[utf8]{inputenc}',
49
+ '\\usepackage{microtype}',
50
+ '\\usepackage{lmodern}'
51
+ ],
52
+ 'tables': [
53
+ '\\usepackage{booktabs}',
54
+ '\\usepackage{array}',
55
+ '\\usepackage{longtable}'
56
+ ],
57
+ 'figures': [
58
+ '\\usepackage{graphicx}',
59
+ '\\usepackage{float}',
60
+ '\\usepackage{caption}',
61
+ '\\usepackage{subcaption}'
62
+ ],
63
+ 'layout': [
64
+ '\\usepackage{geometry}',
65
+ '\\usepackage{fancyhdr}',
66
+ '\\usepackage{titlesec}'
67
+ ],
68
+ 'references': [
69
+ '\\usepackage{hyperref}',
70
+ '\\usepackage{cite}',
71
+ '\\usepackage{url}'
72
+ ]
73
+ }
74
+
75
+ self.document_templates = {
76
+ 'article': {
77
+ 'geometry': '\\geometry{margin=1in}',
78
+ 'spacing': '\\usepackage{setspace}\\onehalfspacing',
79
+ 'sections': ['section', 'subsection', 'subsubsection']
80
+ },
81
+ 'report': {
82
+ 'geometry': '\\geometry{margin=1in}',
83
+ 'spacing': '\\usepackage{setspace}\\onehalfspacing',
84
+ 'sections': ['chapter', 'section', 'subsection', 'subsubsection']
85
+ }
86
+ }
87
+
88
+ def optimize_document(self,
89
+ content: str,
90
+ markdown_content: Dict[str, str],
91
+ optimization_level: str = 'moderate') -> Dict:
92
+ """
93
+ Optimize LaTeX document comprehensively.
94
+
95
+ Args:
96
+ content: Original LaTeX content or markdown content
97
+ markdown_content: Dictionary of markdown files to convert
98
+ optimization_level: 'conservative', 'moderate', 'aggressive'
99
+
100
+ Returns:
101
+ Dictionary with optimized content and optimization details
102
+ """
103
+ print(f"🔧 Starting LaTeX optimization (level: {optimization_level})")
104
+
105
+ # If we have markdown content, convert to LaTeX first
106
+ has_type_preamble = False
107
+ if markdown_content:
108
+ latex_content = self._convert_markdown_to_latex(markdown_content)
109
+ # Check if the content type provided its own preamble blocks
110
+ config_data = self.load_config_from_markdown(markdown_content)
111
+ content_type = config_data.get('_content_type')
112
+ if content_type and content_type.latex_preamble_blocks:
113
+ has_type_preamble = True
114
+ else:
115
+ latex_content = content
116
+
117
+ # Apply optimizations in order
118
+ optimizations_applied = []
119
+
120
+ # Skip structure and typography optimization when content type provides its own preamble
121
+ # (these add duplicate packages and rewrite the preamble)
122
+ if not has_type_preamble:
123
+ # 1. Structure optimization
124
+ latex_content, struct_opts = self._optimize_structure(latex_content)
125
+ optimizations_applied.extend(struct_opts)
126
+
127
+ # 2. Typography optimization
128
+ latex_content, typo_opts = self._optimize_typography(latex_content, optimization_level)
129
+ optimizations_applied.extend(typo_opts)
130
+
131
+ # 3. References and citations
132
+ latex_content, ref_opts = self._optimize_references(latex_content)
133
+ optimizations_applied.extend(ref_opts)
134
+
135
+ # 4. Table optimization
136
+ latex_content, table_opts = self._optimize_tables(latex_content)
137
+ optimizations_applied.extend(table_opts)
138
+
139
+ # 5. Figure optimization
140
+ latex_content, figure_opts = self._optimize_figures(latex_content)
141
+ optimizations_applied.extend(figure_opts)
142
+
143
+ # 6. General cleanup
144
+ latex_content, cleanup_opts = self._apply_general_cleanup(latex_content)
145
+ optimizations_applied.extend(cleanup_opts)
146
+
147
+ # 7. Final formatting pass
148
+ latex_content = self._final_formatting_pass(latex_content)
149
+
150
+ print(f"✅ Applied {len(optimizations_applied)} optimizations")
151
+
152
+ return {
153
+ 'optimized_content': latex_content,
154
+ 'optimizations_applied': optimizations_applied,
155
+ 'optimization_count': len(optimizations_applied),
156
+ 'optimization_level': optimization_level,
157
+ 'timestamp': datetime.now().isoformat()
158
+ }
159
+
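For orientation, a minimal usage sketch of optimize_document. The import path follows the wheel's package layout; the .tex paths are hypothetical. With an empty markdown_content dict the optimizer works on existing LaTeX and needs no ANTHROPIC_API_KEY; passing markdown files instead takes the LLM conversion path and requires the key.

    from agents.latex_specialist.latex_optimizer import LaTeXOptimizer

    optimizer = LaTeXOptimizer(content_source="research_report")
    with open("artifacts/draft_report.tex", encoding="utf-8") as f:
        latex_source = f.read()
    result = optimizer.optimize_document(latex_source, markdown_content={},
                                         optimization_level="moderate")
    print(result["optimization_count"], "optimizations applied")
    with open("artifacts/draft_report_optimized.tex", "w", encoding="utf-8") as f:
        f.write(result["optimized_content"])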
160
+ def load_config_from_markdown(self, markdown_content: Dict[str, str]) -> Dict:
161
+ """Load document configuration from config.md in the markdown_content dict.
162
+
163
+ Uses ContentTypeLoader to resolve the content type and extract
164
+ document class, font size, and paper size from the type definition.
165
+ Parses remaining config sections (metadata, manifest, options) from config.md.
166
+
167
+ Args:
168
+ markdown_content: Dictionary of filename -> content loaded by version manager
169
+
170
+ Returns:
171
+ Parsed configuration dictionary
172
+ """
173
+ config_md = markdown_content.get("config.md", "")
174
+ config = {}
175
+
176
+ if config_md:
177
+ lines = config_md.split('\n')
178
+ current_section = None
179
+ content_lines = []
180
+
181
+ for line in lines:
182
+ if line.startswith('## '):
183
+ if current_section and content_lines:
184
+ config[current_section] = self._parse_config_section_simple(current_section, content_lines)
185
+ current_section = line.replace('## ', '').strip().lower()
186
+ content_lines = []
187
+ elif line.strip() and not line.startswith('#'):
188
+ content_lines.append(line)
189
+
190
+ if current_section and content_lines:
191
+ config[current_section] = self._parse_config_section_simple(current_section, content_lines)
192
+
193
+ # Load content type definition
194
+ type_id = config.get('content type', 'research_report')
195
+ if isinstance(type_id, str):
196
+ type_id = type_id.strip()
197
+
198
+ loader = ContentTypeLoader(types_dir=str(project_root / "content_types"))
199
+ content_type = loader.load_type(type_id)
200
+
201
+ # Inject type defaults into config
202
+ config['document class'] = content_type.document_class
203
+ config['_content_type'] = content_type
204
+ config['_type_font_size'] = content_type.default_font_size
205
+ config['_type_paper_size'] = content_type.default_paper_size
206
+
207
+ # Parse project metadata into top-level fields
208
+ # _parse_config_section_simple already strips '- ' prefixes,
209
+ # so lines arrive as "Key: Value" not "- Key: Value"
210
+ project_meta = config.get('project metadata', '')
211
+ if isinstance(project_meta, str):
212
+ for line in project_meta.split('\n'):
213
+ line = line.strip()
214
+ if ':' in line:
215
+ key, value = line.split(':', 1)
216
+ key = key.strip().strip('*').lower()
217
+ value = value.strip()
218
+ if key == 'title':
219
+ config['title'] = value
220
+ elif key == 'authors':
221
+ config['authors'] = [a.strip() for a in value.split(',')]
222
+
223
+ return config
224
+
225
+ def _parse_config_section_simple(self, section_name: str, content_lines: list):
226
+ """Parse configuration sections from config.md."""
227
+ if section_name in ['document options', 'headers and footers']:
228
+ result = {}
229
+ for line in content_lines:
230
+ if line.startswith('- ') and ':' in line:
231
+ key_value = line[2:].split(':', 1)
232
+ if len(key_value) == 2:
233
+ key = key_value[0].strip()
234
+ value = key_value[1].strip()
235
+ if value.lower() in ['true', 'false']:
236
+ value = value.lower() == 'true'
237
+ result[key] = value
238
+ return result
239
+ elif section_name == 'content manifest':
240
+ structure = []
241
+ for line in content_lines:
242
+ if line.strip() and line[0].isdigit():
243
+ parts = line.split('.', 1)
244
+ if len(parts) == 2:
245
+ section_def = parts[1].strip()
246
+ # Try "Title (filename.md)" format first
247
+ paren_match = re.match(r'^(.+?)\s*\((\S+\.md)\)\s*$', section_def)
248
+ if paren_match:
249
+ title = paren_match.group(1).strip()
250
+ source = paren_match.group(2).strip()
251
+ structure.append({
252
+ 'title': title,
253
+ 'source': source,
254
+ 'type': 'markdown'
255
+ })
256
+ elif ':' in section_def:
257
+ title, source = section_def.split(':', 1)
258
+ structure.append({
259
+ 'title': title.strip(),
260
+ 'source': source.strip(),
261
+ 'type': 'markdown' if source.strip().endswith('.md') else 'auto'
262
+ })
263
+ else:
264
+ structure.append({
265
+ 'title': section_def,
266
+ 'source': None,
267
+ 'type': 'auto'
268
+ })
269
+ return structure
270
+ else:
271
+ content = '\n'.join(content_lines).strip()
272
+ if all(line.startswith('- ') or not line.strip() for line in content_lines if line.strip()):
273
+ return '\n'.join(line[2:] if line.startswith('- ') else line for line in content_lines).strip()
274
+ return content
275
+
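A hedged sketch of the config.md shape this parser expects. Titles, filenames, and metadata are invented for illustration, and load_config_from_markdown also needs a matching research_report definition under content_types/ on disk for the ContentTypeLoader step.

    config_md = "\n".join([
        "## Content Type",
        "research_report",
        "",
        "## Project Metadata",
        "- Title: Transformer Survey",
        "- Authors: A. Researcher, B. Author",
        "",
        "## Document Options",
        "- include_toc: true",
        "",
        "## Content Manifest",
        "1. Abstract",
        "2. Introduction (introduction.md)",
        "3. Methods: methods.md",
    ])
    cfg = LaTeXOptimizer().load_config_from_markdown({"config.md": config_md})
    # cfg["title"]             -> "Transformer Survey"
    # cfg["authors"]           -> ["A. Researcher", "B. Author"]
    # cfg["document options"]  -> {"include_toc": True}
    # cfg["content manifest"][1] -> {"title": "Introduction",
    #                                "source": "introduction.md", "type": "markdown"}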
276
+ def _generate_visualizations(self, gen: LaTeXGenerator):
277
+ """No-op: visualizations are now content-driven via IMAGE comments."""
278
+ pass
279
+
280
+ def _add_images(self, gen: LaTeXGenerator):
281
+ """Add images from the content directory as figures."""
282
+ images_dir = self.content_dir / "images"
283
+ if not images_dir.exists():
284
+ return
285
+
286
+ for img_file in sorted(images_dir.iterdir()):
287
+ if img_file.suffix.lower() in ['.png', '.jpg', '.jpeg']:
288
+ caption = img_file.stem.replace('_', ' ').replace('-', ' ').title()
289
+ label = f"fig:{img_file.stem}"
290
+ gen.add_figure(str(img_file), caption, label=label)
291
+
292
+ def _add_csv_tables(self, gen: LaTeXGenerator):
293
+ """Add CSV data tables from the content directory."""
294
+ data_dir = self.content_dir / "data"
295
+ if not data_dir.exists():
296
+ return
297
+
298
+ for csv_file in sorted(data_dir.iterdir()):
299
+ if csv_file.suffix.lower() == '.csv':
300
+ try:
301
+ with open(csv_file, 'r', encoding='utf-8') as f:
302
+ reader = csv.reader(f)
303
+ rows = list(reader)
304
+
305
+ if rows:
306
+ headers = rows[0]
307
+ data_rows = rows[1:]
308
+ caption = csv_file.stem.replace('_', ' ').replace('-', ' ').title()
309
+ label = f"tab:{csv_file.stem}"
310
+ gen.add_table(
311
+ caption=caption,
312
+ headers=headers,
313
+ rows=data_rows,
314
+ label=label,
315
+ )
316
+ except Exception as e:
317
+ gen.add_raw_latex(f"% Error loading CSV {csv_file.name}: {e}")
318
+
319
+ def _add_bibliography(self, gen: LaTeXGenerator):
320
+ """Add standard bibliography entries."""
321
+ gen.add_bib_entry(
322
+ "\\bibitem{vaswani2017attention}\n"
323
+ "Vaswani, A., et al. (2017). "
324
+ "Attention is all you need. "
325
+ "In Advances in neural information processing systems (pp. 5998-6008)."
326
+ )
327
+ gen.add_bib_entry(
328
+ "\\bibitem{devlin2018bert}\n"
329
+ "Devlin, J., et al. (2018). "
330
+ "BERT: Pre-training of deep bidirectional transformers for language understanding. "
331
+ "arXiv preprint arXiv:1810.04805."
332
+ )
333
+ gen.add_bib_entry(
334
+ "\\bibitem{brown2020gpt3}\n"
335
+ "Brown, T., et al. (2020). "
336
+ "Language models are few-shot learners. "
337
+ "Advances in neural information processing systems, 33, 1877-1901."
338
+ )
339
+
340
+ def _convert_markdown_to_latex(self, markdown_content: Dict[str, str]) -> str:
341
+ """Convert markdown content to a complete LaTeX document.
342
+
343
+ Uses a single holistic LLM call with the full type.md rendering instructions,
344
+ available macros, structure rules, and all section content. The preamble is
345
+ built programmatically from type.md LaTeX code blocks.
346
+ """
347
+ config_data = self.load_config_from_markdown(markdown_content)
348
+ content_type = config_data.get('_content_type')
349
+
350
+ # Pre-process all markdown sections for inline references
351
+ document_structure = config_data.get('content manifest', [])
352
+ processed_sections = []
353
+
354
+ if document_structure:
355
+ for section in document_structure:
356
+ title = section['title']
357
+ source = section.get('source')
358
+ section_type = section.get('type', 'auto')
359
+
360
+ if section_type == 'markdown' and source:
361
+ md_content = markdown_content.get(source, '')
362
+ if md_content:
363
+ processed = self._process_csv_table_references(md_content, str(self.content_dir))
364
+ processed = self._process_image_references(processed, str(self.content_dir))
365
+ processed = self._process_tikz_references(processed)
366
+ processed = re.sub(r'^#\s+[^\n]*\n*', '', processed, count=1)
367
+ processed_sections.append({'title': title, 'content': processed, 'source': source})
368
+ else:
369
+ processed_sections.append({'title': title, 'content': f'[Content not found: {source}]', 'source': source})
370
+ elif title.lower() == 'abstract':
371
+ abstract_content = config_data.get('abstract', '')
372
+ if abstract_content:
373
+ processed_sections.append({'title': 'Abstract', 'content': abstract_content.strip(), 'source': None})
374
+ else:
375
+ processed_sections.append({'title': title, 'content': '[Auto-generated content placeholder]', 'source': None})
376
+ else:
377
+ for filename, content in markdown_content.items():
378
+ if filename == 'config.md':
379
+ continue
380
+ processed = self._process_csv_table_references(content, str(self.content_dir))
381
+ processed = self._process_image_references(processed, str(self.content_dir))
382
+ processed = self._process_tikz_references(processed)
383
+ title = filename.replace('.md', '').replace('_', ' ').title()
384
+ processed_sections.append({'title': title, 'content': processed, 'source': filename})
385
+
386
+ # Get type.md properties
387
+ rendering_instructions = content_type.rendering_instructions if content_type else ""
388
+ preamble_blocks = content_type.latex_preamble_blocks if content_type else []
389
+ structure_rules = content_type.structure_rules if content_type else ""
390
+
391
+ # Build preamble
392
+ preamble = self._build_preamble(config_data, preamble_blocks)
393
+
394
+ # Assemble content for prompt
395
+ assembled_content = self._assemble_content_for_prompt(config_data, document_structure, processed_sections)
396
+
397
+ # Generate document body via single holistic LLM call
398
+ body = self._generate_document_body(
399
+ assembled_content, config_data, rendering_instructions, preamble, structure_rules
400
+ )
401
+
402
+ # Assemble final document
403
+ document = preamble + "\n\n\\begin{document}\n\n" + body + "\n\n\\end{document}\n"
404
+
405
+ return document
406
+
407
+ def _build_preamble(self, config_data: Dict, type_preamble_blocks: List[str]) -> str:
408
+ """Build the LaTeX preamble from type.md code blocks or defaults."""
409
+ doc_class = config_data.get('document class', 'article')
410
+ font_size = config_data.get('_type_font_size', '12pt')
411
+ paper_size = config_data.get('_type_paper_size', 'letterpaper')
412
+ doc_options = config_data.get('document options', {})
413
+ font_size = doc_options.get('font_size', font_size)
414
+ paper_size = doc_options.get('paper_size', paper_size)
415
+
416
+ documentclass_line = f"\\documentclass[{font_size},{paper_size}]{{{doc_class}}}"
417
+ preamble_lines = [documentclass_line]
418
+ print(f" [LaTeX] Preamble documentclass: {documentclass_line}")
419
+
420
+ content_type_id = config_data.get('content type', config_data.get('_content_type', None))
421
+ if hasattr(content_type_id, 'type_id'):
422
+ content_type_id = content_type_id.type_id
423
+
424
+ if type_preamble_blocks:
425
+ print(f" [LaTeX] Loaded {len(type_preamble_blocks)} preamble blocks from {content_type_id or 'content type'} type.md")
426
+ for block in type_preamble_blocks:
427
+ preamble_lines.append(block.strip())
428
+ else:
429
+ if content_type_id and content_type_id != 'research_report':
430
+ print(f" [LaTeX] WARNING: Content type '{content_type_id}' has ZERO preamble blocks — falling back to default preamble. "
431
+ f"This is likely a bug (type.md not found or missing ```latex blocks).")
432
+ else:
433
+ print(f" [LaTeX] Using default preamble (no content type preamble blocks)")
434
+ preamble_lines.append(self._default_preamble())
435
+
436
+ return "\n\n".join(preamble_lines)
437
+
438
+ def _default_preamble(self) -> str:
439
+ """Fallback preamble packages for content types without explicit LaTeX code blocks."""
440
+ return (
441
+ "\\usepackage[T1]{fontenc}\n"
442
+ "\\usepackage[utf8]{inputenc}\n"
443
+ "\\usepackage{lmodern}\n"
444
+ "\\usepackage{microtype}\n"
445
+ "\\usepackage{amsmath}\n"
446
+ "\\usepackage{graphicx}\n"
447
+ "\\usepackage{booktabs}\n"
448
+ "\\usepackage{array}\n"
449
+ "\\usepackage{longtable}\n"
450
+ "\\usepackage{float}\n"
451
+ "\\usepackage{caption}\n"
452
+ "\\usepackage{geometry}\n"
453
+ "\\geometry{margin=1in}\n"
454
+ "\\usepackage{fancyhdr}\n"
455
+ "\\usepackage{setspace}\n"
456
+ "\\onehalfspacing\n"
457
+ "\\usepackage{hyperref}\n"
458
+ "\\hypersetup{colorlinks=true,linkcolor=blue,citecolor=red,urlcolor=blue}\n"
459
+ "\\usepackage{tikz}\n"
460
+ )
461
+
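A small illustration of the default preamble path (no type.md preamble blocks); the config dict is hand-built for the example.

    opt = LaTeXOptimizer()
    cfg = {"document class": "article", "_type_font_size": "12pt",
           "_type_paper_size": "letterpaper", "document options": {}}
    preamble = opt._build_preamble(cfg, type_preamble_blocks=[])
    # preamble.splitlines()[0] -> "\documentclass[12pt,letterpaper]{article}"
    # the remaining lines come from _default_preamble() (fontenc, microtype,
    # booktabs, geometry, hyperref, tikz, ...)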
462
+ def _assemble_content_for_prompt(self, config_data: Dict, structure: List, sections: List[Dict]) -> str:
463
+ """Concatenate all sections with delimiters for the LLM prompt."""
464
+ parts = []
465
+ for sec in sections:
466
+ parts.append(f"=== SECTION: {sec['title']} ===")
467
+ if sec.get('source'):
468
+ parts.append(f"(source: {sec['source']})")
469
+ parts.append(sec['content'])
470
+ parts.append("")
471
+ return "\n".join(parts)
472
+
473
+ def _generate_document_body(self, content: str, config: Dict, instructions: str,
474
+ preamble: str, rules: str) -> str:
475
+ """Generate the complete document body via a single holistic LLM call."""
476
+ if not self.client:
477
+ raise RuntimeError("ANTHROPIC_API_KEY not set — cannot convert markdown to LaTeX")
478
+
479
+ # Build system prompt with rendering context
480
+ system_parts = [
481
+ "You are a LaTeX document generation specialist. Generate the BODY of a LaTeX document "
482
+ "(everything between \\begin{document} and \\end{document}). Output ONLY raw LaTeX — "
483
+ "no code fences, no \\documentclass, no preamble, no \\begin{document}/\\end{document}."
484
+ ]
485
+
486
+ if instructions:
487
+ system_parts.append(f"\n\n## RENDERING INSTRUCTIONS\nFollow these instructions precisely:\n\n{instructions}")
488
+
489
+ if preamble:
490
+ system_parts.append(
491
+ f"\n\n## AVAILABLE PREAMBLE (already included — you may use all macros/environments defined here)\n\n{preamble}"
492
+ )
493
+
494
+ if rules:
495
+ system_parts.append(f"\n\n## STRUCTURE RULES\n\n{rules}")
496
+
497
+ system_prompt = "\n".join(system_parts)
498
+
499
+ # Build user prompt with config metadata and content
500
+ user_parts = ["Generate the complete LaTeX document body for the following content.\n"]
501
+
502
+ # Config metadata
503
+ title = config.get('title', '')
504
+ if title:
505
+ user_parts.append(f"Document Title: {title}")
506
+ authors = config.get('authors', [])
507
+ if authors:
508
+ user_parts.append(f"Authors: {', '.join(authors) if isinstance(authors, list) else authors}")
509
+
510
+ # Include all project metadata
511
+ project_meta = config.get('project metadata', '')
512
+ if project_meta:
513
+ user_parts.append(f"\nProject Metadata:\n{project_meta}")
514
+
515
+ # Include disclaimer if present
516
+ disclaimer = config.get('disclaimer', '')
517
+ if disclaimer:
518
+ user_parts.append(f"\nDisclaimer text (include on cover page):\n{disclaimer}")
519
+
520
+ doc_options = config.get('document options', {})
521
+ if isinstance(doc_options, dict):
522
+ if doc_options.get('include_toc', False):
523
+ user_parts.append("\nInclude a table of contents.")
524
+ if doc_options.get('include_bibliography', False):
525
+ user_parts.append("Include a bibliography/references section at the end.")
526
+
527
+ user_parts.append(f"\n\n## CONTENT\n\n{content}")
528
+
529
+ user_prompt = "\n".join(user_parts)
530
+
531
+ try:
532
+ response = self.client.messages.create(
533
+ model="claude-sonnet-4-20250514",
534
+ max_tokens=16000,
535
+ temperature=0.2,
536
+ system=system_prompt,
537
+ messages=[{
538
+ "role": "user",
539
+ "content": user_prompt,
540
+ }],
541
+ )
542
+ body = response.content[0].text
543
+ # Strip code fences if the LLM wrapped the output
544
+ body = re.sub(r'^```(?:latex)?\s*\n', '', body)
545
+ body = re.sub(r'\n```\s*$', '', body)
546
+ return self._sanitize_unicode_for_latex(body)
547
+ except Exception as e:
548
+ print(f"Error generating document body via LLM: {e}")
549
+ raise
550
+
551
+ def _markdown_to_latex_content(self, markdown: str) -> str:
552
+ """Convert markdown content to LaTeX body content using LLM.
553
+
554
+ Simple per-fragment conversion used by external callers (e.g. report_generator).
555
+ For full document generation, use _convert_markdown_to_latex instead.
556
+ """
557
+ if not self.client:
558
+ raise RuntimeError("ANTHROPIC_API_KEY not set — cannot convert markdown to LaTeX")
559
+
560
+ try:
561
+ response = self.client.messages.create(
562
+ model="claude-sonnet-4-20250514",
563
+ max_tokens=4000,
564
+ temperature=0.2,
565
+ messages=[{
566
+ "role": "user",
567
+ "content": (
568
+ "Convert the following markdown to LaTeX body content. "
569
+ "Output ONLY raw LaTeX — no preamble, no \\documentclass, "
570
+ "no \\begin{document}, no \\end{document}, no code fences. "
571
+ "Use \\subsection as the highest heading level (not \\section). "
572
+ "Use \\subsubsection for lower-level headings. "
573
+ "Use booktabs (\\toprule, \\midrule, \\bottomrule) for tables. "
574
+ "Use itemize/enumerate for lists. "
575
+ "Use \\textbf, \\textit, \\texttt for emphasis. "
576
+ "Use \\href for hyperlinks. "
577
+ "Do NOT generate \\ref{}, \\cite{}, or \\label{} commands "
578
+ "unless they already appear verbatim in the source.\n\n"
579
+ f"{markdown}"
580
+ ),
581
+ }],
582
+ )
583
+ return self._sanitize_unicode_for_latex(response.content[0].text)
584
+ except Exception as e:
585
+ print(f"Error converting markdown to LaTeX via LLM: {e}")
586
+ raise
587
+
588
+ def _sanitize_unicode_for_latex(self, text: str) -> str:
589
+ """Replace common Unicode characters with LaTeX equivalents for pdflatex compatibility."""
590
+ replacements = {
591
+ # Superscripts
592
+ '\u2070': '$^{0}$', '\u00b9': '$^{1}$', '\u00b2': '$^{2}$',
593
+ '\u00b3': '$^{3}$', '\u2074': '$^{4}$', '\u2075': '$^{5}$',
594
+ '\u2076': '$^{6}$', '\u2077': '$^{7}$', '\u2078': '$^{8}$',
595
+ '\u2079': '$^{9}$', '\u207a': '$^{+}$', '\u207b': '$^{-}$',
596
+ # Subscripts
597
+ '\u2080': '$_{0}$', '\u2081': '$_{1}$', '\u2082': '$_{2}$',
598
+ '\u2083': '$_{3}$', '\u2084': '$_{4}$', '\u2085': '$_{5}$',
599
+ '\u2086': '$_{6}$', '\u2087': '$_{7}$', '\u2088': '$_{8}$',
600
+ '\u2089': '$_{9}$',
601
+ # Math symbols
602
+ '\u00d7': '$\\times$', # ×
603
+ '\u00f7': '$\\div$', # ÷
604
+ '\u2264': '$\\leq$', # ≤
605
+ '\u2265': '$\\geq$', # ≥
606
+ '\u2260': '$\\neq$', # ≠
607
+ '\u2248': '$\\approx$', # ≈
608
+ '\u221e': '$\\infty$', # ∞
609
+ '\u00b1': '$\\pm$', # ±
610
+ '\u2190': '$\\leftarrow$', # ←
611
+ '\u2192': '$\\rightarrow$', # →
612
+ # Typography
613
+ '\u2013': '--', # en dash
614
+ '\u2014': '---', # em dash
615
+ '\u2018': '`', # left single quote
616
+ '\u2019': "'", # right single quote
617
+ '\u201c': '``', # left double quote
618
+ '\u201d': "''", # right double quote
619
+ '\u2026': '\\ldots{}', # …
620
+ }
621
+ for char, latex in replacements.items():
622
+ text = text.replace(char, latex)
623
+ return text
624
+
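A quick check of the substitution table above, on an invented sample string:

    opt = LaTeXOptimizer()
    text = "Throughput \u2265 3\u00d7 baseline \u2013 see \u201cResults\u201d"
    print(opt._sanitize_unicode_for_latex(text))
    # Throughput $\geq$ 3$\times$ baseline -- see ``Results''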
625
+ def _process_csv_table_references(self, content: str, content_dir: str = "artifacts/sample_content") -> str:
626
+ """Process CSV table references in markdown content."""
627
+ import re
628
+ from pathlib import Path
629
+
630
+ # Pattern to match CSV table comments (including multi-line with flexible spacing)
631
+ csv_pattern = r'<!-- CSV_TABLE:\s*(.*?)\s*-->'
632
+
633
+ def replace_csv_table(match):
634
+ metadata_text = match.group(1)
635
+ return self._convert_csv_reference_to_latex(metadata_text, content_dir)
636
+
637
+ # Replace all CSV table references (with DOTALL flag for multi-line matching)
638
+ processed_content = re.sub(csv_pattern, replace_csv_table, content, flags=re.DOTALL)
639
+ return processed_content
640
+
641
+ def _process_image_references(self, content: str, content_dir: str = "artifacts/sample_content") -> str:
642
+ """Process IMAGE references in markdown content and convert to LaTeX figures."""
643
+ import re
644
+ from pathlib import Path
645
+
646
+ # Pattern to match IMAGE comments (multi-line)
647
+ image_pattern = r'<!-- IMAGE:\s*(.*?)\s*-->'
648
+
649
+ def replace_image_ref(match):
650
+ metadata_text = match.group(1)
651
+ return self._convert_image_reference_to_latex(metadata_text, content_dir)
652
+
653
+ processed_content = re.sub(image_pattern, replace_image_ref, content, flags=re.DOTALL)
654
+ return processed_content
655
+
656
+ def _process_tikz_references(self, content: str) -> str:
657
+ """Process TIKZ references in markdown content and convert to LaTeX tikzpicture environments."""
658
+ tikz_pattern = r'<!-- TIKZ:\s*(.*?)\s*-->'
659
+
660
+ def replace_tikz_ref(match):
661
+ metadata_text = match.group(1)
662
+ return self._convert_tikz_reference_to_latex(metadata_text)
663
+
664
+ return re.sub(tikz_pattern, replace_tikz_ref, content, flags=re.DOTALL)
665
+
666
+ def _convert_tikz_reference_to_latex(self, metadata_text: str) -> str:
667
+ """Convert a single TIKZ reference to a LaTeX figure with tikzpicture."""
668
+ lines = metadata_text.strip().split('\n')
669
+
670
+ caption = ''
671
+ label = ''
672
+ code_lines = []
673
+ in_code = False
674
+
675
+ for line in lines:
676
+ stripped = line.strip()
677
+ if stripped.startswith('code:'):
678
+ in_code = True
679
+ # Check if there's code on the same line after "code:"
680
+ rest = stripped[5:].strip()
681
+ if rest:
682
+ code_lines.append(rest)
683
+ elif in_code:
684
+ code_lines.append(line.rstrip())
685
+ elif ':' in stripped:
686
+ key, value = stripped.split(':', 1)
687
+ key = key.strip().lower()
688
+ value = value.strip()
689
+ if key == 'caption':
690
+ caption = value
691
+ elif key == 'label':
692
+ label = value
693
+
694
+ if not code_lines:
695
+ return '% TIKZ reference missing code'
696
+
697
+ tikz_code = '\n'.join(code_lines)
698
+
699
+ latex_parts = [
700
+ '\\begin{figure}[htbp]',
701
+ '\\centering',
702
+ '\\begin{tikzpicture}',
703
+ tikz_code,
704
+ '\\end{tikzpicture}',
705
+ ]
706
+ if caption:
707
+ latex_parts.append(f'\\caption{{{caption}}}')
708
+ if label:
709
+ latex_parts.append(f'\\label{{{label}}}')
710
+ latex_parts.append('\\end{figure}')
711
+
712
+ return '\n'.join(latex_parts)
713
+
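A hedged example of the TIKZ comment body this method parses; in a section .md file it would sit inside an <!-- TIKZ: ... --> comment. Caption and label lines must come before "code:", and the diagram itself is invented.

    tikz_meta = "\n".join([
        "caption: Conversion pipeline",
        "label: fig:pipeline",
        "code:",
        r"\node[draw] (md) at (0,0) {Markdown};",
        r"\node[draw] (tex) at (4,0) {LaTeX};",
        r"\draw[->] (md) -- (tex);",
    ])
    print(LaTeXOptimizer()._convert_tikz_reference_to_latex(tikz_meta))
    # -> figure[htbp] wrapping a tikzpicture with the three \node/\draw lines,
    #    followed by \caption{Conversion pipeline} and \label{fig:pipeline}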
714
+ def _convert_image_reference_to_latex(self, metadata_text: str, content_dir: str) -> str:
715
+ """Convert a single IMAGE reference to a LaTeX figure environment."""
716
+ from pathlib import Path
717
+
718
+ lines = metadata_text.strip().split('\n')
719
+ if not lines:
720
+ return "% IMAGE reference missing path"
721
+
722
+ # First line is the image path
723
+ image_path = lines[0].strip()
724
+
725
+ # Parse key-value metadata from remaining lines
726
+ caption = ''
727
+ label = ''
728
+ width = '0.8\\textwidth'
729
+ for line in lines[1:]:
730
+ line = line.strip()
731
+ if ':' in line:
732
+ key, value = line.split(':', 1)
733
+ key = key.strip().lower()
734
+ value = value.strip()
735
+ if key == 'caption':
736
+ caption = value
737
+ elif key == 'label':
738
+ label = value
739
+ elif key == 'width':
740
+ width = value
741
+
742
+ # Resolve image path relative to content directory
743
+ full_path = Path(content_dir) / image_path
744
+ if not full_path.exists():
745
+ return f"% Image not found: {image_path}"
746
+
747
+ # Generate LaTeX figure
748
+ latex_parts = [
749
+ '\\begin{figure}[htbp]',
750
+ '\\centering',
751
+ f'\\includegraphics[width={width}]{{{full_path}}}',
752
+ ]
753
+ if caption:
754
+ latex_parts.append(f'\\caption{{{caption}}}')
755
+ if label:
756
+ latex_parts.append(f'\\label{{{label}}}')
757
+ latex_parts.append('\\end{figure}')
758
+
759
+ return '\n'.join(latex_parts)
760
+
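For reference, the IMAGE comment body consumed here; the filename is illustrative and is resolved against the content directory.

    image_meta = "\n".join([
        "images/results_chart.png",        # first line: path relative to the content dir
        "caption: Benchmark results",
        "label: fig:results",
        r"width: 0.7\textwidth",
    ])
    # In a section .md file this body sits inside an <!-- IMAGE: ... --> comment;
    # _convert_image_reference_to_latex(image_meta, str(self.content_dir)) returns a
    # figure environment, or "% Image not found: ..." if the file is absent.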
761
+ def _convert_csv_reference_to_latex(self, metadata_text: str, content_dir: str) -> str:
762
+ """Convert a single CSV reference to LaTeX table."""
763
+ from pathlib import Path
764
+ import csv
765
+
766
+ # Parse metadata from the comment
767
+ metadata = self._parse_csv_metadata(metadata_text)
768
+
769
+ csv_filename = metadata.get('filename')
770
+ if not csv_filename:
771
+ return "% CSV table reference missing filename"
772
+
773
+ # Load CSV data
774
+ csv_path = Path(content_dir) / "data" / csv_filename
775
+ if not csv_path.exists():
776
+ return f"% CSV file not found: {csv_filename}"
777
+
778
+ try:
779
+ with open(csv_path, 'r', encoding='utf-8') as f:
780
+ reader = csv.reader(f)
781
+ rows = list(reader)
782
+
783
+ if not rows:
784
+ return f"% Empty CSV file: {csv_filename}"
785
+
786
+ # Extract headers and data based on metadata
787
+ headers = rows[0]
788
+ data_rows = rows[1:]
789
+
790
+ # Apply column filtering
791
+ columns = metadata.get('columns', 'all')
792
+ if columns != 'all':
793
+ try:
794
+ if isinstance(columns, str):
795
+ if '-' in columns:
796
+ # Range like "1-3"
797
+ start, end = map(int, columns.split('-'))
798
+ col_indices = list(range(start-1, end)) # Convert to 0-based
799
+ else:
800
+ # Single column
801
+ col_indices = [int(columns)-1]
802
+ else:
803
+ # List of columns
804
+ col_indices = [int(c)-1 for c in columns]
805
+
806
+ headers = [headers[i] for i in col_indices if i < len(headers)]
807
+ data_rows = [[row[i] if i < len(row) else '' for i in col_indices] for row in data_rows]
808
+ except (ValueError, IndexError):
809
+ # Fall back to all columns if parsing fails
810
+ pass
811
+
812
+ # Apply row filtering
813
+ rows_spec = metadata.get('rows', 'all')
814
+ if rows_spec != 'all':
815
+ try:
816
+ if isinstance(rows_spec, str) and '-' in rows_spec:
817
+ # Range like "1-5"
818
+ start, end = map(int, rows_spec.split('-'))
819
+ data_rows = data_rows[start-1:end] # Convert to 0-based
820
+ elif isinstance(rows_spec, str):
821
+ # Single row or number
822
+ max_rows = int(rows_spec)
823
+ data_rows = data_rows[:max_rows]
824
+ except (ValueError, IndexError):
825
+ # Fall back to all rows if parsing fails
826
+ pass
827
+
828
+ # Generate LaTeX table
829
+ return self._generate_csv_latex_table(headers, data_rows, metadata)
830
+
831
+ except Exception as e:
832
+ return f"% Error loading CSV {csv_filename}: {str(e)}"
833
+
834
+ def _parse_csv_metadata(self, metadata_text: str) -> dict:
835
+ """Parse CSV table metadata from comment text."""
836
+ metadata = {}
837
+ lines = metadata_text.strip().split('\n')
838
+
839
+ # First line should be the filename
840
+ if lines:
841
+ metadata['filename'] = lines[0].strip()
842
+
843
+ # Parse key-value pairs from remaining lines
844
+ for line in lines[1:]:
845
+ line = line.strip()
846
+ if ':' in line:
847
+ key, value = line.split(':', 1)
848
+ key = key.strip()
849
+ value = value.strip()
850
+ metadata[key] = value
851
+
852
+ return metadata
853
+
854
+ def _generate_csv_latex_table(self, headers: list, data_rows: list, metadata: dict) -> str:
855
+ """Generate LaTeX table from CSV data and metadata."""
856
+ if not headers:
857
+ return "% No headers found in CSV data"
858
+
859
+ num_cols = len(headers)
860
+ col_spec = 'l' * num_cols # Default to left-aligned columns
861
+
862
+ # Get metadata values
863
+ caption = metadata.get('caption', 'CSV Data Table')
864
+ label = metadata.get('label', 'tab:csv_table')
865
+ table_format = metadata.get('format', 'professional')
866
+ description = metadata.get('description', '')
867
+
868
+ latex_parts = []
869
+
870
+ # Add description if provided
871
+ if description:
872
+ latex_parts.append(f"% {description}")
873
+ latex_parts.append("")
874
+
875
+ # Start table
876
+ latex_parts.extend([
877
+ '\\begin{table}[htbp]',
878
+ '\\centering',
879
+ f'\\begin{{tabular}}{{{col_spec}}}'
880
+ ])
881
+
882
+ # Add professional formatting if requested
883
+ if table_format == 'professional':
884
+ latex_parts.append('\\toprule')
885
+ else:
886
+ latex_parts.append('\\hline')
887
+
888
+ # Add header row
889
+ header_latex = ' & '.join(headers) + ' \\\\'
890
+ latex_parts.append(header_latex)
891
+
892
+ # Add separator
893
+ if table_format == 'professional':
894
+ latex_parts.append('\\midrule')
895
+ else:
896
+ latex_parts.append('\\hline')
897
+
898
+ # Add data rows
899
+ for row in data_rows:
900
+ # Ensure row has the right number of columns
901
+ while len(row) < num_cols:
902
+ row.append('')
903
+ row = row[:num_cols] # Truncate if too many columns
904
+
905
+ row_latex = ' & '.join(str(cell) for cell in row) + ' \\\\'
906
+ latex_parts.append(row_latex)
907
+
908
+ # End table
909
+ if table_format == 'professional':
910
+ latex_parts.append('\\bottomrule')
911
+ else:
912
+ latex_parts.append('\\hline')
913
+
914
+ latex_parts.extend([
915
+ '\\end{tabular}',
916
+ f'\\caption{{{caption}}}',
917
+ f'\\label{{{label}}}',
918
+ '\\end{table}'
919
+ ])
920
+
921
+ return '\n'.join(latex_parts)
922
+
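An example CSV_TABLE comment as it would appear in a section .md file; the filename and metadata are illustrative. The columns and rows fields accept "all", a single 1-based index, or a range such as "1-3".

    csv_comment = "\n".join([
        "<!-- CSV_TABLE: benchmark_results.csv",
        "caption: Benchmark Results",
        "label: tab:benchmarks",
        "columns: 1-3",
        "rows: 1-5",
        "format: professional",
        "-->",
    ])
    # _process_csv_table_references(csv_comment, str(self.content_dir)) swaps the
    # comment for a booktabs table (\toprule/\midrule/\bottomrule) built from
    # data/benchmark_results.csv, keeping columns 1-3 and the first five data rows;
    # if the CSV is missing, a "% CSV file not found" comment is emitted instead.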
923
+ def _optimize_structure(self, content: str) -> Tuple[str, List[str]]:
924
+ """Optimize document structure and organization."""
925
+ optimizations = []
926
+ optimized = content
927
+
928
+ # Ensure proper document class
929
+ if not re.search(r'\\documentclass', optimized):
930
+ optimized = '\\documentclass[12pt,letterpaper]{article}\n\n' + optimized
931
+ optimizations.append('Added professional document class')
932
+
933
+ # Ensure title and author if missing
934
+ if not re.search(r'\\title\{', optimized) and not re.search(r'\\maketitle', optimized):
935
+ # Add after document class
936
+ class_match = re.search(r'(\\documentclass.*\n)', optimized)
937
+ if class_match:
938
+ insert_pos = class_match.end()
939
+ title_block = '\n\\title{Research Report}\n\\author{Research Team}\n\\date{\\today}\n'
940
+ optimized = optimized[:insert_pos] + title_block + optimized[insert_pos:]
941
+ optimizations.append('Added title and author information')
942
+
943
+ # Add table of contents if document has sections
944
+ if re.search(r'\\(section|chapter)', optimized) and not re.search(r'\\tableofcontents', optimized):
945
+ # Add after \begin{document} and \maketitle
946
+ begin_doc = re.search(r'\\begin\{document\}', optimized)
947
+ if begin_doc:
948
+ # Look for \maketitle or add TOC right after \begin{document}
949
+ maketitle_match = re.search(r'\\maketitle', optimized[begin_doc.end():])
950
+ if maketitle_match:
951
+ insert_pos = begin_doc.end() + maketitle_match.end()
952
+ toc_block = '\n\\tableofcontents\n\\newpage\n'
953
+ else:
954
+ insert_pos = begin_doc.end()
955
+ toc_block = '\n\\tableofcontents\n\\newpage\n'
956
+
957
+ optimized = optimized[:insert_pos] + toc_block + optimized[insert_pos:]
958
+ optimizations.append('Added table of contents')
959
+
960
+ # Ensure proper section hierarchy
961
+ optimized, hierarchy_opts = self._fix_section_hierarchy(optimized)
962
+ optimizations.extend(hierarchy_opts)
963
+
964
+ return optimized, optimizations
965
+
966
+ def _optimize_typography(self, content: str, level: str) -> Tuple[str, List[str]]:
967
+ """Optimize typography and formatting."""
968
+ optimizations = []
969
+ optimized = content
970
+
971
+ # Essential typography packages
972
+ essential_packages = [
973
+ ('fontenc', '\\usepackage[T1]{fontenc}'),
974
+ ('inputenc', '\\usepackage[utf8]{inputenc}'),
975
+ ('lmodern', '\\usepackage{lmodern}'),
976
+ ('microtype', '\\usepackage{microtype}'),
977
+ ]
978
+
979
+ # Add packages if missing
980
+ for package_name, package_line in essential_packages:
981
+ if not re.search(f'\\\\usepackage.*{{{package_name}}}', optimized):
982
+ # Insert after documentclass
983
+ class_match = re.search(r'(\\documentclass.*\n)', optimized)
984
+ if class_match:
985
+ insert_pos = class_match.end()
986
+ optimized = optimized[:insert_pos] + package_line + '\n' + optimized[insert_pos:]
987
+ optimizations.append(f'Added {package_name} package for better typography')
988
+
989
+ # Add geometry for proper margins
990
+ if not re.search(r'\\usepackage.*\{geometry\}', optimized):
991
+ class_match = re.search(r'(\\documentclass.*\n)', optimized)
992
+ if class_match:
993
+ insert_pos = class_match.end()
994
+ geometry_block = '\\usepackage{geometry}\n\\geometry{margin=1in}\n'
995
+ optimized = optimized[:insert_pos] + geometry_block + optimized[insert_pos:]
996
+ optimizations.append('Added geometry package with proper margins')
997
+
998
+ # Add spacing improvements
999
+ if level in ['moderate', 'aggressive']:
1000
+ if not re.search(r'\\usepackage.*\{setspace\}', optimized):
1001
+ class_match = re.search(r'(\\documentclass.*\n)', optimized)
1002
+ if class_match:
1003
+ insert_pos = class_match.end()
1004
+ spacing_block = '\\usepackage{setspace}\n\\onehalfspacing\n'
1005
+ optimized = optimized[:insert_pos] + spacing_block + optimized[insert_pos:]
1006
+ optimizations.append('Added improved line spacing')
1007
+
1008
+ # Fix spacing issues
1009
+ spacing_fixes = [
1010
+ (r'[ \t]{2,}', ' ', 'Fixed multiple consecutive spaces'),  # spaces/tabs only, so blank-line paragraph breaks survive
1011
+ (r'([.!?])([A-Z])', r'\1 \2', 'Added missing spaces after sentences'),
1012
+ (r'\s+([.!?])', r'\1', 'Fixed spaces before punctuation'),
1013
+ ]
1014
+
1015
+ for pattern, replacement, description in spacing_fixes:
1016
+ if re.search(pattern, optimized):
1017
+ optimized = re.sub(pattern, replacement, optimized)
1018
+ optimizations.append(description)
1019
+
1020
+ return optimized, optimizations
1021
+
1022
+ def _optimize_tables(self, content: str) -> Tuple[str, List[str]]:
1023
+ """Optimize table formatting."""
1024
+ optimizations = []
1025
+ optimized = content
1026
+
1027
+ # Check if document has tables
1028
+ has_tables = re.search(r'\\begin\{tabular\}|\\begin\{table\}', optimized)
1029
+
1030
+ if has_tables:
1031
+ # Add booktabs package
1032
+ if not re.search(r'\\usepackage.*\{booktabs\}', optimized):
1033
+ class_match = re.search(r'(\\documentclass.*\n)', optimized)
1034
+ if class_match:
1035
+ insert_pos = class_match.end()
1036
+ optimized = optimized[:insert_pos] + '\\usepackage{booktabs}\n' + optimized[insert_pos:]
1037
+ optimizations.append('Added booktabs package for professional tables')
1038
+
1039
+ # Replace \\hline with booktabs rules
1040
+ if re.search(r'\\hline', optimized):
1041
+ # This is a simplified replacement - in practice, you'd want more sophisticated logic
1042
+ optimized = re.sub(r'\\hline', r'\\midrule', optimized)  # raw replacement so re.sub emits a literal \midrule
1043
+ optimizations.append('Replaced \\hline with professional booktabs rules')
1044
+
1045
+ # Add array package for better column types
1046
+ if not re.search(r'\\usepackage.*\{array\}', optimized):
1047
+ class_match = re.search(r'(\\documentclass.*\n)', optimized)
1048
+ if class_match:
1049
+ insert_pos = class_match.end()
1050
+ optimized = optimized[:insert_pos] + '\\usepackage{array}\n' + optimized[insert_pos:]
1051
+ optimizations.append('Added array package for better table formatting')
1052
+
1053
+ return optimized, optimizations
1054
+
1055
+ def _optimize_figures(self, content: str) -> Tuple[str, List[str]]:
1056
+ """Optimize figure formatting and placement."""
1057
+ optimizations = []
1058
+ optimized = content
1059
+
1060
+ # Check if document has figures
1061
+ has_figures = re.search(r'\\includegraphics|\\begin\{figure\}', optimized)
1062
+
1063
+ if has_figures:
1064
+ # Essential figure packages
1065
+ figure_packages = [
1066
+ ('graphicx', '\\usepackage{graphicx}'),
1067
+ ('float', '\\usepackage{float}'),
1068
+ ('caption', '\\usepackage{caption}')
1069
+ ]
1070
+
1071
+ for package_name, package_line in figure_packages:
1072
+ if not re.search(f'\\\\usepackage.*{{{package_name}}}', optimized):
1073
+ class_match = re.search(r'(\\documentclass.*\n)', optimized)
1074
+ if class_match:
1075
+ insert_pos = class_match.end()
1076
+ optimized = optimized[:insert_pos] + package_line + '\n' + optimized[insert_pos:]
1077
+ optimizations.append(f'Added {package_name} package for better figures')
1078
+
1079
+ # Improve figure placement
1080
+ figure_placements = re.findall(r'\\begin\{figure\}\[([^\]]*)\]', optimized)
1081
+ poor_placements = [p for p in figure_placements if 'h' in p and 't' not in p and 'b' not in p]
1082
+
1083
+ if poor_placements:
1084
+ # Replace poor placements with better options
1085
+ optimized = re.sub(r'\\begin\{figure\}\[h\]', r'\\begin{figure}[htbp]', optimized)  # raw replacement keeps \b literal
1086
+ optimizations.append('Improved figure placement options')
1087
+
1088
+ return optimized, optimizations
1089
+
1090
+ def _optimize_references(self, content: str) -> Tuple[str, List[str]]:
1091
+ """Optimize references and citations."""
1092
+ optimizations = []
1093
+ optimized = content
1094
+
1095
+ # Add hyperref for better navigation (should be last)
1096
+ if not re.search(r'\\usepackage.*\{hyperref\}', optimized):
1097
+ # Add before \begin{document}
1098
+ begin_doc = re.search(r'\\begin\{document\}', optimized)
1099
+ if begin_doc:
1100
+ insert_pos = begin_doc.start()
1101
+ hyperref_block = '\\usepackage{hyperref}\n\\hypersetup{\n colorlinks=true,\n linkcolor=blue,\n citecolor=red,\n urlcolor=blue\n}\n\n'
1102
+ optimized = optimized[:insert_pos] + hyperref_block + optimized[insert_pos:]
1103
+ optimizations.append('Added hyperref package for better navigation')
1104
+
1105
+ return optimized, optimizations
1106
+
1107
+ def _apply_general_cleanup(self, content: str) -> Tuple[str, List[str]]:
1108
+ """Apply general cleanup and improvements."""
1109
+ optimizations = []
1110
+ optimized = content
1111
+
1112
+ # Remove excessive blank lines
1113
+ original_lines = len(optimized.split('\n'))
1114
+ optimized = re.sub(r'\n{3,}', '\n\n', optimized)
1115
+ new_lines = len(optimized.split('\n'))
1116
+
1117
+ if new_lines < original_lines:
1118
+ optimizations.append(f'Cleaned up excessive blank lines ({original_lines - new_lines} lines removed)')
1119
+
1120
+ # Fix common LaTeX spacing issues
1121
+ common_fixes = [
1122
+ (r'\\section\s*\{', r'\\section{', 'Fixed section command spacing'),
1123
+ (r'\\subsection\s*\{', r'\\subsection{', 'Fixed subsection command spacing'),
1124
+ (r'\\textbf\s*\{', r'\\textbf{', 'Fixed textbf command spacing'),
1125
+ (r'\\textit\s*\{', r'\\textit{', 'Fixed textit command spacing'),
1126
+ ]
1127
+
1128
+ for pattern, replacement, description in common_fixes:
1129
+ if re.search(pattern, optimized):
1130
+ optimized = re.sub(pattern, replacement, optimized)
1131
+ optimizations.append(description)
1132
+
1133
+ return optimized, optimizations
1134
+
1135
+ def _fix_section_hierarchy(self, content: str) -> Tuple[str, List[str]]:
1136
+ """Fix section hierarchy issues."""
1137
+ optimizations = []
1138
+ # This would contain logic to fix section nesting issues
1139
+ # For now, return as-is
1140
+ return content, optimizations
1141
+
1142
+ def _final_formatting_pass(self, content: str) -> str:
1143
+ """Apply final formatting improvements.
1144
+
1145
+ Only modifies the document body — the preamble (everything before
1146
+ \\begin{document}) is returned unchanged to avoid breaking custom
1147
+ macro definitions (\\newcommand, \\newenvironment, etc.).
1148
+ """
1149
+ # Split at \begin{document} so regexes only touch the body
1150
+ split_marker = "\\begin{document}"
1151
+ marker_pos = content.find(split_marker)
1152
+ if marker_pos == -1:
1153
+ # No \begin{document} — apply to entire content (legacy path)
1154
+ body = content
1155
+ preamble = ""
1156
+ rejoin = False
1157
+ else:
1158
+ preamble = content[:marker_pos + len(split_marker)]
1159
+ body = content[marker_pos + len(split_marker):]
1160
+ rejoin = True
1161
+
1162
+ # Ensure proper spacing around environments
1163
+ # Preserve optional arguments like \begin{tikzpicture}[remember picture, overlay]
1164
+ body = re.sub(r'(\\begin\{[^}]+\}(?:\[[^\]]*\])?)\n{0,1}', r'\1\n', body)
1165
+ body = re.sub(r'\n{0,1}(\\end\{[^}]+\})', r'\n\1', body)
1166
+
1167
+ # Ensure proper spacing around sections
1168
+ body = re.sub(r'(\\(?:sub)*section\{[^}]+\})\n{0,1}', r'\1\n\n', body)
1169
+
1170
+ if rejoin:
1171
+ result = preamble + body
1172
+ else:
1173
+ result = body
1174
+
1175
+ # Clean up final whitespace
1176
+ return result.strip()
1177
+
1178
+ def calculate_optimization_score(self, before_issues: int, after_issues: int, optimizations_count: int) -> int:
1179
+ """Calculate optimization effectiveness score."""
1180
+ issues_fixed = max(0, before_issues - after_issues)
1181
+
1182
+ # Base score from issues fixed
1183
+ score = min(50, issues_fixed * 5)
1184
+
1185
+ # Bonus for optimizations applied
1186
+ score += min(30, optimizations_count * 2)
1187
+
1188
+ # Bonus for significant improvement
1189
+ if issues_fixed > before_issues * 0.5: # Fixed more than 50% of issues
1190
+ score += 20
1191
+
1192
+ return min(100, score)
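A worked example of the scoring above, using invented counts:

    # 10 issues before, 3 after, 8 optimizations applied:
    #   issues_fixed = 7  -> base  min(50, 7 * 5) = 35
    #   optimizations     -> bonus min(30, 8 * 2) = 16   (running total 51)
    #   7 > 10 * 0.5      -> +20                         (running total 71)
    LaTeXOptimizer().calculate_optimization_score(10, 3, 8)  # -> 71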