deepagents-printshop 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/content_editor/__init__.py +1 -0
- agents/content_editor/agent.py +279 -0
- agents/content_editor/content_reviewer.py +327 -0
- agents/content_editor/versioned_agent.py +455 -0
- agents/latex_specialist/__init__.py +1 -0
- agents/latex_specialist/agent.py +531 -0
- agents/latex_specialist/latex_analyzer.py +510 -0
- agents/latex_specialist/latex_optimizer.py +1192 -0
- agents/qa_orchestrator/__init__.py +1 -0
- agents/qa_orchestrator/agent.py +603 -0
- agents/qa_orchestrator/langgraph_workflow.py +733 -0
- agents/qa_orchestrator/pipeline_types.py +72 -0
- agents/qa_orchestrator/quality_gates.py +495 -0
- agents/qa_orchestrator/workflow_coordinator.py +139 -0
- agents/research_agent/__init__.py +1 -0
- agents/research_agent/agent.py +258 -0
- agents/research_agent/llm_report_generator.py +1023 -0
- agents/research_agent/report_generator.py +536 -0
- agents/visual_qa/__init__.py +1 -0
- agents/visual_qa/agent.py +410 -0
- deepagents_printshop-0.1.0.dist-info/METADATA +744 -0
- deepagents_printshop-0.1.0.dist-info/RECORD +37 -0
- deepagents_printshop-0.1.0.dist-info/WHEEL +4 -0
- deepagents_printshop-0.1.0.dist-info/entry_points.txt +2 -0
- deepagents_printshop-0.1.0.dist-info/licenses/LICENSE +86 -0
- tools/__init__.py +1 -0
- tools/change_tracker.py +419 -0
- tools/content_type_loader.py +171 -0
- tools/graph_generator.py +281 -0
- tools/latex_generator.py +374 -0
- tools/llm_latex_generator.py +678 -0
- tools/magazine_layout.py +462 -0
- tools/pattern_injector.py +250 -0
- tools/pattern_learner.py +477 -0
- tools/pdf_compiler.py +386 -0
- tools/version_manager.py +346 -0
- tools/visual_qa.py +799 -0
|
@@ -0,0 +1,1192 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LaTeX Optimizer - Milestone 3
|
|
3
|
+
|
|
4
|
+
Optimizes LaTeX document structure, typography, and formatting for professional quality.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import os
|
|
9
|
+
import csv
|
|
10
|
+
from typing import Dict, List, Tuple, Optional
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
import anthropic
|
|
14
|
+
|
|
15
|
+
# Make the project root importable so `tools.*` imports below resolve
# regardless of the directory the process was launched from.
# (Was: __import__('sys') — a plain import is the idiomatic equivalent.)
import sys

project_root = Path(__file__).parent.parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
|
|
19
|
+
|
|
20
|
+
from tools.latex_generator import LaTeXGenerator, DocumentConfig
|
|
21
|
+
from tools.content_type_loader import ContentTypeLoader
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LaTeXOptimizer:
|
|
25
|
+
"""
|
|
26
|
+
Optimizes LaTeX documents for professional formatting and structure.
|
|
27
|
+
|
|
28
|
+
Features:
|
|
29
|
+
- Document structure optimization
|
|
30
|
+
- Typography enhancement
|
|
31
|
+
- Table and figure formatting improvement
|
|
32
|
+
- LaTeX best practices application
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
def __init__(self, content_source: str = "research_report"):
|
|
36
|
+
"""Initialize the LaTeX optimizer.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
content_source: Content source folder name (e.g., 'research_report', 'magazine')
|
|
40
|
+
"""
|
|
41
|
+
self.content_source = content_source
|
|
42
|
+
self.content_dir = Path("artifacts/sample_content") / content_source
|
|
43
|
+
self.api_key = os.getenv('ANTHROPIC_API_KEY')
|
|
44
|
+
self.client = anthropic.Anthropic(api_key=self.api_key) if self.api_key else None
|
|
45
|
+
self.professional_packages = {
|
|
46
|
+
'typography': [
|
|
47
|
+
'\\usepackage[T1]{fontenc}',
|
|
48
|
+
'\\usepackage[utf8]{inputenc}',
|
|
49
|
+
'\\usepackage{microtype}',
|
|
50
|
+
'\\usepackage{lmodern}'
|
|
51
|
+
],
|
|
52
|
+
'tables': [
|
|
53
|
+
'\\usepackage{booktabs}',
|
|
54
|
+
'\\usepackage{array}',
|
|
55
|
+
'\\usepackage{longtable}'
|
|
56
|
+
],
|
|
57
|
+
'figures': [
|
|
58
|
+
'\\usepackage{graphicx}',
|
|
59
|
+
'\\usepackage{float}',
|
|
60
|
+
'\\usepackage{caption}',
|
|
61
|
+
'\\usepackage{subcaption}'
|
|
62
|
+
],
|
|
63
|
+
'layout': [
|
|
64
|
+
'\\usepackage{geometry}',
|
|
65
|
+
'\\usepackage{fancyhdr}',
|
|
66
|
+
'\\usepackage{titlesec}'
|
|
67
|
+
],
|
|
68
|
+
'references': [
|
|
69
|
+
'\\usepackage{hyperref}',
|
|
70
|
+
'\\usepackage{cite}',
|
|
71
|
+
'\\usepackage{url}'
|
|
72
|
+
]
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
self.document_templates = {
|
|
76
|
+
'article': {
|
|
77
|
+
'geometry': '\\geometry{margin=1in}',
|
|
78
|
+
'spacing': '\\usepackage{setspace}\\onehalfspacing',
|
|
79
|
+
'sections': ['section', 'subsection', 'subsubsection']
|
|
80
|
+
},
|
|
81
|
+
'report': {
|
|
82
|
+
'geometry': '\\geometry{margin=1in}',
|
|
83
|
+
'spacing': '\\usepackage{setspace}\\onehalfspacing',
|
|
84
|
+
'sections': ['chapter', 'section', 'subsection', 'subsubsection']
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
def optimize_document(self,
                      content: str,
                      markdown_content: Dict[str, str],
                      optimization_level: str = 'moderate') -> Dict:
    """
    Optimize LaTeX document comprehensively.

    Args:
        content: Original LaTeX content or markdown content
        markdown_content: Dictionary of markdown files to convert; when
            non-empty it takes precedence over `content` and is converted
            to LaTeX first
        optimization_level: 'conservative', 'moderate', 'aggressive'
            (only the typography pass consults this)

    Returns:
        Dictionary with optimized content and optimization details
    """
    print(f"🔧 Starting LaTeX optimization (level: {optimization_level})")

    # If we have markdown content, convert to LaTeX first
    has_type_preamble = False
    if markdown_content:
        latex_content = self._convert_markdown_to_latex(markdown_content)
        # Check if the content type provided its own preamble blocks
        config_data = self.load_config_from_markdown(markdown_content)
        content_type = config_data.get('_content_type')
        if content_type and content_type.latex_preamble_blocks:
            has_type_preamble = True
    else:
        latex_content = content

    # Each _optimize_* helper returns (new_content, list_of_applied_names);
    # the names are accumulated for the result summary.
    optimizations_applied = []

    # Skip structure and typography optimization when content type provides its own preamble
    # (these add duplicate packages and rewrite the preamble)
    if not has_type_preamble:
        # Structure optimization
        latex_content, struct_opts = self._optimize_structure(latex_content)
        optimizations_applied.extend(struct_opts)

        # Typography optimization (the only level-sensitive pass)
        latex_content, typo_opts = self._optimize_typography(latex_content, optimization_level)
        optimizations_applied.extend(typo_opts)

    # References and citations
    latex_content, ref_opts = self._optimize_references(latex_content)
    optimizations_applied.extend(ref_opts)

    # Table formatting
    latex_content, table_opts = self._optimize_tables(latex_content)
    optimizations_applied.extend(table_opts)

    # Figure formatting
    latex_content, figure_opts = self._optimize_figures(latex_content)
    optimizations_applied.extend(figure_opts)

    # General cleanup
    latex_content, cleanup_opts = self._apply_general_cleanup(latex_content)
    optimizations_applied.extend(cleanup_opts)

    # Final formatting pass (returns content only; records no optimizations)
    latex_content = self._final_formatting_pass(latex_content)

    print(f"✅ Applied {len(optimizations_applied)} optimizations")

    return {
        'optimized_content': latex_content,
        'optimizations_applied': optimizations_applied,
        'optimization_count': len(optimizations_applied),
        'optimization_level': optimization_level,
        'timestamp': datetime.now().isoformat()
    }
|
|
159
|
+
|
|
160
|
+
def load_config_from_markdown(self, markdown_content: Dict[str, str]) -> Dict:
    """Load document configuration from config.md in the markdown_content dict.

    Uses ContentTypeLoader to resolve the content type and extract
    document class, font size, and paper size from the type definition.
    Parses remaining config sections (metadata, manifest, options) from config.md.

    Args:
        markdown_content: Dictionary of filename -> content loaded by version manager

    Returns:
        Parsed configuration dictionary; section names are lowercased,
        and keys prefixed with '_' carry the resolved content-type
        object and its defaults.
    """
    config_md = markdown_content.get("config.md", "")
    config = {}

    if config_md:
        # Walk config.md line by line, collecting '## Section' bodies and
        # handing each completed section to _parse_config_section_simple.
        lines = config_md.split('\n')
        current_section = None
        content_lines = []

        for line in lines:
            if line.startswith('## '):
                # Flush the previous section before starting a new one.
                if current_section and content_lines:
                    config[current_section] = self._parse_config_section_simple(current_section, content_lines)
                current_section = line.replace('## ', '').strip().lower()
                content_lines = []
            elif line.strip() and not line.startswith('#'):
                # Other '#' headings (e.g. the document title) are skipped.
                content_lines.append(line)

        # Flush the final section.
        if current_section and content_lines:
            config[current_section] = self._parse_config_section_simple(current_section, content_lines)

    # Load content type definition (defaults to 'research_report')
    type_id = config.get('content type', 'research_report')
    if isinstance(type_id, str):
        type_id = type_id.strip()

    loader = ContentTypeLoader(types_dir=str(project_root / "content_types"))
    content_type = loader.load_type(type_id)

    # Inject type defaults into config
    config['document class'] = content_type.document_class
    config['_content_type'] = content_type
    config['_type_font_size'] = content_type.default_font_size
    config['_type_paper_size'] = content_type.default_paper_size

    # Parse project metadata into top-level fields
    # _parse_config_section_simple already strips '- ' prefixes,
    # so lines arrive as "Key: Value" not "- Key: Value"
    project_meta = config.get('project metadata', '')
    if isinstance(project_meta, str):
        for line in project_meta.split('\n'):
            line = line.strip()
            if ':' in line:
                key, value = line.split(':', 1)
                # strip('*') tolerates markdown-bold keys like '**Title**'
                key = key.strip().strip('*').lower()
                value = value.strip()
                if key == 'title':
                    config['title'] = value
                elif key == 'authors':
                    # Authors arrive comma-separated on one line.
                    config['authors'] = [a.strip() for a in value.split(',')]

    return config
|
|
224
|
+
|
|
225
|
+
def _parse_config_section_simple(self, section_name: str, content_lines: list):
|
|
226
|
+
"""Parse configuration sections from config.md."""
|
|
227
|
+
if section_name in ['document options', 'headers and footers']:
|
|
228
|
+
result = {}
|
|
229
|
+
for line in content_lines:
|
|
230
|
+
if line.startswith('- ') and ':' in line:
|
|
231
|
+
key_value = line[2:].split(':', 1)
|
|
232
|
+
if len(key_value) == 2:
|
|
233
|
+
key = key_value[0].strip()
|
|
234
|
+
value = key_value[1].strip()
|
|
235
|
+
if value.lower() in ['true', 'false']:
|
|
236
|
+
value = value.lower() == 'true'
|
|
237
|
+
result[key] = value
|
|
238
|
+
return result
|
|
239
|
+
elif section_name == 'content manifest':
|
|
240
|
+
structure = []
|
|
241
|
+
for line in content_lines:
|
|
242
|
+
if line.strip() and line[0].isdigit():
|
|
243
|
+
parts = line.split('.', 1)
|
|
244
|
+
if len(parts) == 2:
|
|
245
|
+
section_def = parts[1].strip()
|
|
246
|
+
# Try "Title (filename.md)" format first
|
|
247
|
+
paren_match = re.match(r'^(.+?)\s*\((\S+\.md)\)\s*$', section_def)
|
|
248
|
+
if paren_match:
|
|
249
|
+
title = paren_match.group(1).strip()
|
|
250
|
+
source = paren_match.group(2).strip()
|
|
251
|
+
structure.append({
|
|
252
|
+
'title': title,
|
|
253
|
+
'source': source,
|
|
254
|
+
'type': 'markdown'
|
|
255
|
+
})
|
|
256
|
+
elif ':' in section_def:
|
|
257
|
+
title, source = section_def.split(':', 1)
|
|
258
|
+
structure.append({
|
|
259
|
+
'title': title.strip(),
|
|
260
|
+
'source': source.strip(),
|
|
261
|
+
'type': 'markdown' if source.strip().endswith('.md') else 'auto'
|
|
262
|
+
})
|
|
263
|
+
else:
|
|
264
|
+
structure.append({
|
|
265
|
+
'title': section_def,
|
|
266
|
+
'source': None,
|
|
267
|
+
'type': 'auto'
|
|
268
|
+
})
|
|
269
|
+
return structure
|
|
270
|
+
else:
|
|
271
|
+
content = '\n'.join(content_lines).strip()
|
|
272
|
+
if all(line.startswith('- ') or not line.strip() for line in content_lines if line.strip()):
|
|
273
|
+
return '\n'.join(line[2:] if line.startswith('- ') else line for line in content_lines).strip()
|
|
274
|
+
return content
|
|
275
|
+
|
|
276
|
+
def _generate_visualizations(self, gen: LaTeXGenerator):
    """Intentionally a no-op: visualizations are content-driven via IMAGE comments."""
    return None
|
|
279
|
+
|
|
280
|
+
def _add_images(self, gen: LaTeXGenerator):
    """Register every raster image in <content_dir>/images as a LaTeX figure."""
    images_dir = self.content_dir / "images"
    if not images_dir.exists():
        return

    accepted_suffixes = ['.png', '.jpg', '.jpeg']
    for entry in sorted(images_dir.iterdir()):
        if entry.suffix.lower() not in accepted_suffixes:
            continue
        # Derive a human-readable caption from the file name stem.
        nice_caption = entry.stem.replace('_', ' ').replace('-', ' ').title()
        gen.add_figure(str(entry), nice_caption, label=f"fig:{entry.stem}")
|
|
291
|
+
|
|
292
|
+
def _add_csv_tables(self, gen: LaTeXGenerator):
    """Load each CSV file in <content_dir>/data and append it as a LaTeX table."""
    data_dir = self.content_dir / "data"
    if not data_dir.exists():
        return

    for entry in sorted(data_dir.iterdir()):
        if entry.suffix.lower() != '.csv':
            continue
        try:
            with open(entry, 'r', encoding='utf-8') as handle:
                all_rows = list(csv.reader(handle))

            if all_rows:
                # First row is the header; the rest are data rows.
                table_caption = entry.stem.replace('_', ' ').replace('-', ' ').title()
                gen.add_table(
                    caption=table_caption,
                    headers=all_rows[0],
                    rows=all_rows[1:],
                    label=f"tab:{entry.stem}",
                )
        except Exception as e:
            # Best-effort: record the failure as a LaTeX comment rather than abort.
            gen.add_raw_latex(f"% Error loading CSV {entry.name}: {e}")
|
|
318
|
+
|
|
319
|
+
def _add_bibliography(self, gen: LaTeXGenerator):
    """Append the standard set of NLP reference \\bibitem entries."""
    standard_entries = [
        "\\bibitem{vaswani2017attention}\n"
        "Vaswani, A., et al. (2017). "
        "Attention is all you need. "
        "In Advances in neural information processing systems (pp. 5998-6008).",

        "\\bibitem{devlin2018bert}\n"
        "Devlin, J., et al. (2018). "
        "BERT: Pre-training of deep bidirectional transformers for language understanding. "
        "arXiv preprint arXiv:1810.04805.",

        "\\bibitem{brown2020gpt3}\n"
        "Brown, T., et al. (2020). "
        "Language models are few-shot learners. "
        "Advances in neural information processing systems, 33, 1877-1901.",
    ]
    for entry in standard_entries:
        gen.add_bib_entry(entry)
|
|
339
|
+
|
|
340
|
+
def _convert_markdown_to_latex(self, markdown_content: Dict[str, str]) -> str:
    """Convert markdown content to a complete LaTeX document.

    Uses a single holistic LLM call with the full type.md rendering instructions,
    available macros, structure rules, and all section content. The preamble is
    built programmatically from type.md LaTeX code blocks.
    """
    config_data = self.load_config_from_markdown(markdown_content)
    content_type = config_data.get('_content_type')

    # Pre-process all markdown sections for inline references
    document_structure = config_data.get('content manifest', [])
    processed_sections = []

    if document_structure:
        # Manifest-driven ordering: each entry names a title and optionally
        # a markdown source file.
        for section in document_structure:
            title = section['title']
            source = section.get('source')
            section_type = section.get('type', 'auto')

            if section_type == 'markdown' and source:
                md_content = markdown_content.get(source, '')
                if md_content:
                    # Expand CSV_TABLE / IMAGE / TIKZ comment directives into
                    # LaTeX, then drop the first top-level '#' heading — the
                    # manifest title is used for the section heading instead.
                    processed = self._process_csv_table_references(md_content, str(self.content_dir))
                    processed = self._process_image_references(processed, str(self.content_dir))
                    processed = self._process_tikz_references(processed)
                    processed = re.sub(r'^#\s+[^\n]*\n*', '', processed, count=1)
                    processed_sections.append({'title': title, 'content': processed, 'source': source})
                else:
                    # Keep a visible placeholder so the missing file is obvious
                    # in the rendered output.
                    processed_sections.append({'title': title, 'content': f'[Content not found: {source}]', 'source': source})
            elif title.lower() == 'abstract':
                # Abstract text lives in config.md rather than its own file.
                # NOTE(review): an empty abstract silently drops the section.
                abstract_content = config_data.get('abstract', '')
                if abstract_content:
                    processed_sections.append({'title': 'Abstract', 'content': abstract_content.strip(), 'source': None})
            else:
                processed_sections.append({'title': title, 'content': '[Auto-generated content placeholder]', 'source': None})
    else:
        # No manifest: include every markdown file except the config itself,
        # titling each section from its filename.
        for filename, content in markdown_content.items():
            if filename == 'config.md':
                continue
            processed = self._process_csv_table_references(content, str(self.content_dir))
            processed = self._process_image_references(processed, str(self.content_dir))
            processed = self._process_tikz_references(processed)
            title = filename.replace('.md', '').replace('_', ' ').title()
            processed_sections.append({'title': title, 'content': processed, 'source': filename})

    # Get type.md properties (all optional — empty when no content type)
    rendering_instructions = content_type.rendering_instructions if content_type else ""
    preamble_blocks = content_type.latex_preamble_blocks if content_type else []
    structure_rules = content_type.structure_rules if content_type else ""

    # Build preamble
    preamble = self._build_preamble(config_data, preamble_blocks)

    # Assemble content for prompt
    assembled_content = self._assemble_content_for_prompt(config_data, document_structure, processed_sections)

    # Generate document body via single holistic LLM call
    body = self._generate_document_body(
        assembled_content, config_data, rendering_instructions, preamble, structure_rules
    )

    # Assemble final document
    document = preamble + "\n\n\\begin{document}\n\n" + body + "\n\n\\end{document}\n"

    return document
|
|
406
|
+
|
|
407
|
+
def _build_preamble(self, config_data: Dict, type_preamble_blocks: List[str]) -> str:
|
|
408
|
+
"""Build the LaTeX preamble from type.md code blocks or defaults."""
|
|
409
|
+
doc_class = config_data.get('document class', 'article')
|
|
410
|
+
font_size = config_data.get('_type_font_size', '12pt')
|
|
411
|
+
paper_size = config_data.get('_type_paper_size', 'letterpaper')
|
|
412
|
+
doc_options = config_data.get('document options', {})
|
|
413
|
+
font_size = doc_options.get('font_size', font_size)
|
|
414
|
+
paper_size = doc_options.get('paper_size', paper_size)
|
|
415
|
+
|
|
416
|
+
documentclass_line = f"\\documentclass[{font_size},{paper_size}]{{{doc_class}}}"
|
|
417
|
+
preamble_lines = [documentclass_line]
|
|
418
|
+
print(f" [LaTeX] Preamble documentclass: {documentclass_line}")
|
|
419
|
+
|
|
420
|
+
content_type_id = config_data.get('content type', config_data.get('_content_type', None))
|
|
421
|
+
if hasattr(content_type_id, 'type_id'):
|
|
422
|
+
content_type_id = content_type_id.type_id
|
|
423
|
+
|
|
424
|
+
if type_preamble_blocks:
|
|
425
|
+
print(f" [LaTeX] Loaded {len(type_preamble_blocks)} preamble blocks from {content_type_id or 'content type'} type.md")
|
|
426
|
+
for block in type_preamble_blocks:
|
|
427
|
+
preamble_lines.append(block.strip())
|
|
428
|
+
else:
|
|
429
|
+
if content_type_id and content_type_id != 'research_report':
|
|
430
|
+
print(f" [LaTeX] WARNING: Content type '{content_type_id}' has ZERO preamble blocks — falling back to default preamble. "
|
|
431
|
+
f"This is likely a bug (type.md not found or missing ```latex blocks).")
|
|
432
|
+
else:
|
|
433
|
+
print(f" [LaTeX] Using default preamble (no content type preamble blocks)")
|
|
434
|
+
preamble_lines.append(self._default_preamble())
|
|
435
|
+
|
|
436
|
+
return "\n\n".join(preamble_lines)
|
|
437
|
+
|
|
438
|
+
def _default_preamble(self) -> str:
|
|
439
|
+
"""Fallback preamble packages for content types without explicit LaTeX code blocks."""
|
|
440
|
+
return (
|
|
441
|
+
"\\usepackage[T1]{fontenc}\n"
|
|
442
|
+
"\\usepackage[utf8]{inputenc}\n"
|
|
443
|
+
"\\usepackage{lmodern}\n"
|
|
444
|
+
"\\usepackage{microtype}\n"
|
|
445
|
+
"\\usepackage{amsmath}\n"
|
|
446
|
+
"\\usepackage{graphicx}\n"
|
|
447
|
+
"\\usepackage{booktabs}\n"
|
|
448
|
+
"\\usepackage{array}\n"
|
|
449
|
+
"\\usepackage{longtable}\n"
|
|
450
|
+
"\\usepackage{float}\n"
|
|
451
|
+
"\\usepackage{caption}\n"
|
|
452
|
+
"\\usepackage{geometry}\n"
|
|
453
|
+
"\\geometry{margin=1in}\n"
|
|
454
|
+
"\\usepackage{fancyhdr}\n"
|
|
455
|
+
"\\usepackage{setspace}\n"
|
|
456
|
+
"\\onehalfspacing\n"
|
|
457
|
+
"\\usepackage{hyperref}\n"
|
|
458
|
+
"\\hypersetup{colorlinks=true,linkcolor=blue,citecolor=red,urlcolor=blue}\n"
|
|
459
|
+
"\\usepackage{tikz}\n"
|
|
460
|
+
)
|
|
461
|
+
|
|
462
|
+
def _assemble_content_for_prompt(self, config_data: Dict, structure: List, sections: List[Dict]) -> str:
|
|
463
|
+
"""Concatenate all sections with delimiters for the LLM prompt."""
|
|
464
|
+
parts = []
|
|
465
|
+
for sec in sections:
|
|
466
|
+
parts.append(f"=== SECTION: {sec['title']} ===")
|
|
467
|
+
if sec.get('source'):
|
|
468
|
+
parts.append(f"(source: {sec['source']})")
|
|
469
|
+
parts.append(sec['content'])
|
|
470
|
+
parts.append("")
|
|
471
|
+
return "\n".join(parts)
|
|
472
|
+
|
|
473
|
+
def _generate_document_body(self, content: str, config: Dict, instructions: str,
                            preamble: str, rules: str) -> str:
    """Generate the complete document body via a single holistic LLM call.

    Args:
        content: Delimited section content from _assemble_content_for_prompt.
        config: Parsed config dict (title, authors, metadata, options).
        instructions: type.md rendering instructions (may be empty).
        preamble: The already-built preamble, shown to the model so it can
            use the macros/environments defined there.
        rules: type.md structure rules (may be empty).

    Returns:
        Raw LaTeX body text (no preamble, no document environment), with
        code fences stripped and Unicode sanitized for pdflatex.

    Raises:
        RuntimeError: if no Anthropic client is configured.
        Exception: re-raises any API/transport error after logging.
    """
    if not self.client:
        raise RuntimeError("ANTHROPIC_API_KEY not set — cannot convert markdown to LaTeX")

    # Build system prompt with rendering context
    system_parts = [
        "You are a LaTeX document generation specialist. Generate the BODY of a LaTeX document "
        "(everything between \\begin{document} and \\end{document}). Output ONLY raw LaTeX — "
        "no code fences, no \\documentclass, no preamble, no \\begin{document}/\\end{document}."
    ]

    if instructions:
        system_parts.append(f"\n\n## RENDERING INSTRUCTIONS\nFollow these instructions precisely:\n\n{instructions}")

    if preamble:
        system_parts.append(
            f"\n\n## AVAILABLE PREAMBLE (already included — you may use all macros/environments defined here)\n\n{preamble}"
        )

    if rules:
        system_parts.append(f"\n\n## STRUCTURE RULES\n\n{rules}")

    system_prompt = "\n".join(system_parts)

    # Build user prompt with config metadata and content
    user_parts = ["Generate the complete LaTeX document body for the following content.\n"]

    # Config metadata
    title = config.get('title', '')
    if title:
        user_parts.append(f"Document Title: {title}")
    authors = config.get('authors', [])
    if authors:
        # 'authors' may arrive as a list (parsed) or a raw string.
        user_parts.append(f"Authors: {', '.join(authors) if isinstance(authors, list) else authors}")

    # Include all project metadata
    project_meta = config.get('project metadata', '')
    if project_meta:
        user_parts.append(f"\nProject Metadata:\n{project_meta}")

    # Include disclaimer if present
    disclaimer = config.get('disclaimer', '')
    if disclaimer:
        user_parts.append(f"\nDisclaimer text (include on cover page):\n{disclaimer}")

    doc_options = config.get('document options', {})
    if isinstance(doc_options, dict):
        if doc_options.get('include_toc', False):
            user_parts.append("\nInclude a table of contents.")
        if doc_options.get('include_bibliography', False):
            user_parts.append("Include a bibliography/references section at the end.")

    user_parts.append(f"\n\n## CONTENT\n\n{content}")

    user_prompt = "\n".join(user_parts)

    try:
        response = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=16000,
            temperature=0.2,
            system=system_prompt,
            messages=[{
                "role": "user",
                "content": user_prompt,
            }],
        )
        body = response.content[0].text
        # Strip code fences if the LLM wrapped the output
        body = re.sub(r'^```(?:latex)?\s*\n', '', body)
        body = re.sub(r'\n```\s*$', '', body)
        return self._sanitize_unicode_for_latex(body)
    except Exception as e:
        print(f"Error generating document body via LLM: {e}")
        raise
|
|
550
|
+
|
|
551
|
+
def _markdown_to_latex_content(self, markdown: str) -> str:
    """Convert markdown content to LaTeX body content using LLM.

    Simple per-fragment conversion used by external callers (e.g. report_generator).
    For full document generation, use _convert_markdown_to_latex instead.

    Args:
        markdown: A markdown fragment to convert.

    Returns:
        Raw LaTeX body text, Unicode-sanitized for pdflatex.

    Raises:
        RuntimeError: if no Anthropic client is configured.
        Exception: re-raises any API/transport error after logging.
    """
    if not self.client:
        raise RuntimeError("ANTHROPIC_API_KEY not set — cannot convert markdown to LaTeX")

    try:
        response = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4000,
            temperature=0.2,
            messages=[{
                "role": "user",
                # Headings are demoted one level (\subsection max) so the
                # fragment nests under a caller-provided \section.
                "content": (
                    "Convert the following markdown to LaTeX body content. "
                    "Output ONLY raw LaTeX — no preamble, no \\documentclass, "
                    "no \\begin{document}, no \\end{document}, no code fences. "
                    "Use \\subsection as the highest heading level (not \\section). "
                    "Use \\subsubsection for lower-level headings. "
                    "Use booktabs (\\toprule, \\midrule, \\bottomrule) for tables. "
                    "Use itemize/enumerate for lists. "
                    "Use \\textbf, \\textit, \\texttt for emphasis. "
                    "Use \\href for hyperlinks. "
                    "Do NOT generate \\ref{}, \\cite{}, or \\label{} commands "
                    "unless they already appear verbatim in the source.\n\n"
                    f"{markdown}"
                ),
            }],
        )
        return self._sanitize_unicode_for_latex(response.content[0].text)
    except Exception as e:
        print(f"Error converting markdown to LaTeX via LLM: {e}")
        raise
|
|
587
|
+
|
|
588
|
+
def _sanitize_unicode_for_latex(self, text: str) -> str:
|
|
589
|
+
"""Replace common Unicode characters with LaTeX equivalents for pdflatex compatibility."""
|
|
590
|
+
replacements = {
|
|
591
|
+
# Superscripts
|
|
592
|
+
'\u2070': '$^{0}$', '\u00b9': '$^{1}$', '\u00b2': '$^{2}$',
|
|
593
|
+
'\u00b3': '$^{3}$', '\u2074': '$^{4}$', '\u2075': '$^{5}$',
|
|
594
|
+
'\u2076': '$^{6}$', '\u2077': '$^{7}$', '\u2078': '$^{8}$',
|
|
595
|
+
'\u2079': '$^{9}$', '\u207a': '$^{+}$', '\u207b': '$^{-}$',
|
|
596
|
+
# Subscripts
|
|
597
|
+
'\u2080': '$_{0}$', '\u2081': '$_{1}$', '\u2082': '$_{2}$',
|
|
598
|
+
'\u2083': '$_{3}$', '\u2084': '$_{4}$', '\u2085': '$_{5}$',
|
|
599
|
+
'\u2086': '$_{6}$', '\u2087': '$_{7}$', '\u2088': '$_{8}$',
|
|
600
|
+
'\u2089': '$_{9}$',
|
|
601
|
+
# Math symbols
|
|
602
|
+
'\u00d7': '$\\times$', # ×
|
|
603
|
+
'\u00f7': '$\\div$', # ÷
|
|
604
|
+
'\u2264': '$\\leq$', # ≤
|
|
605
|
+
'\u2265': '$\\geq$', # ≥
|
|
606
|
+
'\u2260': '$\\neq$', # ≠
|
|
607
|
+
'\u2248': '$\\approx$', # ≈
|
|
608
|
+
'\u221e': '$\\infty$', # ∞
|
|
609
|
+
'\u00b1': '$\\pm$', # ±
|
|
610
|
+
'\u2190': '$\\leftarrow$', # ←
|
|
611
|
+
'\u2192': '$\\rightarrow$', # →
|
|
612
|
+
# Typography
|
|
613
|
+
'\u2013': '--', # en dash
|
|
614
|
+
'\u2014': '---', # em dash
|
|
615
|
+
'\u2018': '`', # left single quote
|
|
616
|
+
'\u2019': "'", # right single quote
|
|
617
|
+
'\u201c': '``', # left double quote
|
|
618
|
+
'\u201d': "''", # right double quote
|
|
619
|
+
'\u2026': '\\ldots{}', # …
|
|
620
|
+
}
|
|
621
|
+
for char, latex in replacements.items():
|
|
622
|
+
text = text.replace(char, latex)
|
|
623
|
+
return text
|
|
624
|
+
|
|
625
|
+
def _process_csv_table_references(self, content: str, content_dir: str = "artifacts/sample_content") -> str:
|
|
626
|
+
"""Process CSV table references in markdown content."""
|
|
627
|
+
import re
|
|
628
|
+
from pathlib import Path
|
|
629
|
+
|
|
630
|
+
# Pattern to match CSV table comments (including multi-line with flexible spacing)
|
|
631
|
+
csv_pattern = r'<!-- CSV_TABLE:\s*(.*?)\s*-->'
|
|
632
|
+
|
|
633
|
+
def replace_csv_table(match):
|
|
634
|
+
metadata_text = match.group(1)
|
|
635
|
+
return self._convert_csv_reference_to_latex(metadata_text, content_dir)
|
|
636
|
+
|
|
637
|
+
# Replace all CSV table references (with DOTALL flag for multi-line matching)
|
|
638
|
+
processed_content = re.sub(csv_pattern, replace_csv_table, content, flags=re.DOTALL)
|
|
639
|
+
return processed_content
|
|
640
|
+
|
|
641
|
+
def _process_image_references(self, content: str, content_dir: str = "artifacts/sample_content") -> str:
|
|
642
|
+
"""Process IMAGE references in markdown content and convert to LaTeX figures."""
|
|
643
|
+
import re
|
|
644
|
+
from pathlib import Path
|
|
645
|
+
|
|
646
|
+
# Pattern to match IMAGE comments (multi-line)
|
|
647
|
+
image_pattern = r'<!-- IMAGE:\s*(.*?)\s*-->'
|
|
648
|
+
|
|
649
|
+
def replace_image_ref(match):
|
|
650
|
+
metadata_text = match.group(1)
|
|
651
|
+
return self._convert_image_reference_to_latex(metadata_text, content_dir)
|
|
652
|
+
|
|
653
|
+
processed_content = re.sub(image_pattern, replace_image_ref, content, flags=re.DOTALL)
|
|
654
|
+
return processed_content
|
|
655
|
+
|
|
656
|
+
def _process_tikz_references(self, content: str) -> str:
|
|
657
|
+
"""Process TIKZ references in markdown content and convert to LaTeX tikzpicture environments."""
|
|
658
|
+
tikz_pattern = r'<!-- TIKZ:\s*(.*?)\s*-->'
|
|
659
|
+
|
|
660
|
+
def replace_tikz_ref(match):
|
|
661
|
+
metadata_text = match.group(1)
|
|
662
|
+
return self._convert_tikz_reference_to_latex(metadata_text)
|
|
663
|
+
|
|
664
|
+
return re.sub(tikz_pattern, replace_tikz_ref, content, flags=re.DOTALL)
|
|
665
|
+
|
|
666
|
+
def _convert_tikz_reference_to_latex(self, metadata_text: str) -> str:
|
|
667
|
+
"""Convert a single TIKZ reference to a LaTeX figure with tikzpicture."""
|
|
668
|
+
lines = metadata_text.strip().split('\n')
|
|
669
|
+
|
|
670
|
+
caption = ''
|
|
671
|
+
label = ''
|
|
672
|
+
code_lines = []
|
|
673
|
+
in_code = False
|
|
674
|
+
|
|
675
|
+
for line in lines:
|
|
676
|
+
stripped = line.strip()
|
|
677
|
+
if stripped.startswith('code:'):
|
|
678
|
+
in_code = True
|
|
679
|
+
# Check if there's code on the same line after "code:"
|
|
680
|
+
rest = stripped[5:].strip()
|
|
681
|
+
if rest:
|
|
682
|
+
code_lines.append(rest)
|
|
683
|
+
elif in_code:
|
|
684
|
+
code_lines.append(line.rstrip())
|
|
685
|
+
elif ':' in stripped:
|
|
686
|
+
key, value = stripped.split(':', 1)
|
|
687
|
+
key = key.strip().lower()
|
|
688
|
+
value = value.strip()
|
|
689
|
+
if key == 'caption':
|
|
690
|
+
caption = value
|
|
691
|
+
elif key == 'label':
|
|
692
|
+
label = value
|
|
693
|
+
|
|
694
|
+
if not code_lines:
|
|
695
|
+
return '% TIKZ reference missing code'
|
|
696
|
+
|
|
697
|
+
tikz_code = '\n'.join(code_lines)
|
|
698
|
+
|
|
699
|
+
latex_parts = [
|
|
700
|
+
'\\begin{figure}[htbp]',
|
|
701
|
+
'\\centering',
|
|
702
|
+
'\\begin{tikzpicture}',
|
|
703
|
+
tikz_code,
|
|
704
|
+
'\\end{tikzpicture}',
|
|
705
|
+
]
|
|
706
|
+
if caption:
|
|
707
|
+
latex_parts.append(f'\\caption{{{caption}}}')
|
|
708
|
+
if label:
|
|
709
|
+
latex_parts.append(f'\\label{{{label}}}')
|
|
710
|
+
latex_parts.append('\\end{figure}')
|
|
711
|
+
|
|
712
|
+
return '\n'.join(latex_parts)
|
|
713
|
+
|
|
714
|
+
def _convert_image_reference_to_latex(self, metadata_text: str, content_dir: str) -> str:
|
|
715
|
+
"""Convert a single IMAGE reference to a LaTeX figure environment."""
|
|
716
|
+
from pathlib import Path
|
|
717
|
+
|
|
718
|
+
lines = metadata_text.strip().split('\n')
|
|
719
|
+
if not lines:
|
|
720
|
+
return "% IMAGE reference missing path"
|
|
721
|
+
|
|
722
|
+
# First line is the image path
|
|
723
|
+
image_path = lines[0].strip()
|
|
724
|
+
|
|
725
|
+
# Parse key-value metadata from remaining lines
|
|
726
|
+
caption = ''
|
|
727
|
+
label = ''
|
|
728
|
+
width = '0.8\\textwidth'
|
|
729
|
+
for line in lines[1:]:
|
|
730
|
+
line = line.strip()
|
|
731
|
+
if ':' in line:
|
|
732
|
+
key, value = line.split(':', 1)
|
|
733
|
+
key = key.strip().lower()
|
|
734
|
+
value = value.strip()
|
|
735
|
+
if key == 'caption':
|
|
736
|
+
caption = value
|
|
737
|
+
elif key == 'label':
|
|
738
|
+
label = value
|
|
739
|
+
elif key == 'width':
|
|
740
|
+
width = value
|
|
741
|
+
|
|
742
|
+
# Resolve image path relative to content directory
|
|
743
|
+
full_path = Path(content_dir) / image_path
|
|
744
|
+
if not full_path.exists():
|
|
745
|
+
return f"% Image not found: {image_path}"
|
|
746
|
+
|
|
747
|
+
# Generate LaTeX figure
|
|
748
|
+
latex_parts = [
|
|
749
|
+
'\\begin{figure}[htbp]',
|
|
750
|
+
'\\centering',
|
|
751
|
+
f'\\includegraphics[width={width}]{{{full_path}}}',
|
|
752
|
+
]
|
|
753
|
+
if caption:
|
|
754
|
+
latex_parts.append(f'\\caption{{{caption}}}')
|
|
755
|
+
if label:
|
|
756
|
+
latex_parts.append(f'\\label{{{label}}}')
|
|
757
|
+
latex_parts.append('\\end{figure}')
|
|
758
|
+
|
|
759
|
+
return '\n'.join(latex_parts)
|
|
760
|
+
|
|
761
|
+
def _convert_csv_reference_to_latex(self, metadata_text: str, content_dir: str) -> str:
|
|
762
|
+
"""Convert a single CSV reference to LaTeX table."""
|
|
763
|
+
from pathlib import Path
|
|
764
|
+
import csv
|
|
765
|
+
|
|
766
|
+
# Parse metadata from the comment
|
|
767
|
+
metadata = self._parse_csv_metadata(metadata_text)
|
|
768
|
+
|
|
769
|
+
csv_filename = metadata.get('filename')
|
|
770
|
+
if not csv_filename:
|
|
771
|
+
return "% CSV table reference missing filename"
|
|
772
|
+
|
|
773
|
+
# Load CSV data
|
|
774
|
+
csv_path = Path(content_dir) / "data" / csv_filename
|
|
775
|
+
if not csv_path.exists():
|
|
776
|
+
return f"% CSV file not found: {csv_filename}"
|
|
777
|
+
|
|
778
|
+
try:
|
|
779
|
+
with open(csv_path, 'r', encoding='utf-8') as f:
|
|
780
|
+
reader = csv.reader(f)
|
|
781
|
+
rows = list(reader)
|
|
782
|
+
|
|
783
|
+
if not rows:
|
|
784
|
+
return f"% Empty CSV file: {csv_filename}"
|
|
785
|
+
|
|
786
|
+
# Extract headers and data based on metadata
|
|
787
|
+
headers = rows[0]
|
|
788
|
+
data_rows = rows[1:]
|
|
789
|
+
|
|
790
|
+
# Apply column filtering
|
|
791
|
+
columns = metadata.get('columns', 'all')
|
|
792
|
+
if columns != 'all':
|
|
793
|
+
try:
|
|
794
|
+
if isinstance(columns, str):
|
|
795
|
+
if '-' in columns:
|
|
796
|
+
# Range like "1-3"
|
|
797
|
+
start, end = map(int, columns.split('-'))
|
|
798
|
+
col_indices = list(range(start-1, end)) # Convert to 0-based
|
|
799
|
+
else:
|
|
800
|
+
# Single column
|
|
801
|
+
col_indices = [int(columns)-1]
|
|
802
|
+
else:
|
|
803
|
+
# List of columns
|
|
804
|
+
col_indices = [int(c)-1 for c in columns]
|
|
805
|
+
|
|
806
|
+
headers = [headers[i] for i in col_indices if i < len(headers)]
|
|
807
|
+
data_rows = [[row[i] if i < len(row) else '' for i in col_indices] for row in data_rows]
|
|
808
|
+
except (ValueError, IndexError):
|
|
809
|
+
# Fall back to all columns if parsing fails
|
|
810
|
+
pass
|
|
811
|
+
|
|
812
|
+
# Apply row filtering
|
|
813
|
+
rows_spec = metadata.get('rows', 'all')
|
|
814
|
+
if rows_spec != 'all':
|
|
815
|
+
try:
|
|
816
|
+
if isinstance(rows_spec, str) and '-' in rows_spec:
|
|
817
|
+
# Range like "1-5"
|
|
818
|
+
start, end = map(int, rows_spec.split('-'))
|
|
819
|
+
data_rows = data_rows[start-1:end] # Convert to 0-based
|
|
820
|
+
elif isinstance(rows_spec, str):
|
|
821
|
+
# Single row or number
|
|
822
|
+
max_rows = int(rows_spec)
|
|
823
|
+
data_rows = data_rows[:max_rows]
|
|
824
|
+
except (ValueError, IndexError):
|
|
825
|
+
# Fall back to all rows if parsing fails
|
|
826
|
+
pass
|
|
827
|
+
|
|
828
|
+
# Generate LaTeX table
|
|
829
|
+
return self._generate_csv_latex_table(headers, data_rows, metadata)
|
|
830
|
+
|
|
831
|
+
except Exception as e:
|
|
832
|
+
return f"% Error loading CSV {csv_filename}: {str(e)}"
|
|
833
|
+
|
|
834
|
+
def _parse_csv_metadata(self, metadata_text: str) -> dict:
|
|
835
|
+
"""Parse CSV table metadata from comment text."""
|
|
836
|
+
metadata = {}
|
|
837
|
+
lines = metadata_text.strip().split('\n')
|
|
838
|
+
|
|
839
|
+
# First line should be the filename
|
|
840
|
+
if lines:
|
|
841
|
+
metadata['filename'] = lines[0].strip()
|
|
842
|
+
|
|
843
|
+
# Parse key-value pairs from remaining lines
|
|
844
|
+
for line in lines[1:]:
|
|
845
|
+
line = line.strip()
|
|
846
|
+
if ':' in line:
|
|
847
|
+
key, value = line.split(':', 1)
|
|
848
|
+
key = key.strip()
|
|
849
|
+
value = value.strip()
|
|
850
|
+
metadata[key] = value
|
|
851
|
+
|
|
852
|
+
return metadata
|
|
853
|
+
|
|
854
|
+
def _generate_csv_latex_table(self, headers: list, data_rows: list, metadata: dict) -> str:
|
|
855
|
+
"""Generate LaTeX table from CSV data and metadata."""
|
|
856
|
+
if not headers:
|
|
857
|
+
return "% No headers found in CSV data"
|
|
858
|
+
|
|
859
|
+
num_cols = len(headers)
|
|
860
|
+
col_spec = 'l' * num_cols # Default to left-aligned columns
|
|
861
|
+
|
|
862
|
+
# Get metadata values
|
|
863
|
+
caption = metadata.get('caption', 'CSV Data Table')
|
|
864
|
+
label = metadata.get('label', 'tab:csv_table')
|
|
865
|
+
table_format = metadata.get('format', 'professional')
|
|
866
|
+
description = metadata.get('description', '')
|
|
867
|
+
|
|
868
|
+
latex_parts = []
|
|
869
|
+
|
|
870
|
+
# Add description if provided
|
|
871
|
+
if description:
|
|
872
|
+
latex_parts.append(f"% {description}")
|
|
873
|
+
latex_parts.append("")
|
|
874
|
+
|
|
875
|
+
# Start table
|
|
876
|
+
latex_parts.extend([
|
|
877
|
+
'\\begin{table}[htbp]',
|
|
878
|
+
'\\centering',
|
|
879
|
+
f'\\begin{{tabular}}{{{col_spec}}}'
|
|
880
|
+
])
|
|
881
|
+
|
|
882
|
+
# Add professional formatting if requested
|
|
883
|
+
if table_format == 'professional':
|
|
884
|
+
latex_parts.append('\\toprule')
|
|
885
|
+
else:
|
|
886
|
+
latex_parts.append('\\hline')
|
|
887
|
+
|
|
888
|
+
# Add header row
|
|
889
|
+
header_latex = ' & '.join(headers) + ' \\\\'
|
|
890
|
+
latex_parts.append(header_latex)
|
|
891
|
+
|
|
892
|
+
# Add separator
|
|
893
|
+
if table_format == 'professional':
|
|
894
|
+
latex_parts.append('\\midrule')
|
|
895
|
+
else:
|
|
896
|
+
latex_parts.append('\\hline')
|
|
897
|
+
|
|
898
|
+
# Add data rows
|
|
899
|
+
for row in data_rows:
|
|
900
|
+
# Ensure row has the right number of columns
|
|
901
|
+
while len(row) < num_cols:
|
|
902
|
+
row.append('')
|
|
903
|
+
row = row[:num_cols] # Truncate if too many columns
|
|
904
|
+
|
|
905
|
+
row_latex = ' & '.join(str(cell) for cell in row) + ' \\\\'
|
|
906
|
+
latex_parts.append(row_latex)
|
|
907
|
+
|
|
908
|
+
# End table
|
|
909
|
+
if table_format == 'professional':
|
|
910
|
+
latex_parts.append('\\bottomrule')
|
|
911
|
+
else:
|
|
912
|
+
latex_parts.append('\\hline')
|
|
913
|
+
|
|
914
|
+
latex_parts.extend([
|
|
915
|
+
'\\end{tabular}',
|
|
916
|
+
f'\\caption{{{caption}}}',
|
|
917
|
+
f'\\label{{{label}}}',
|
|
918
|
+
'\\end{table}'
|
|
919
|
+
])
|
|
920
|
+
|
|
921
|
+
return '\n'.join(latex_parts)
|
|
922
|
+
|
|
923
|
+
def _optimize_structure(self, content: str) -> Tuple[str, List[str]]:
|
|
924
|
+
"""Optimize document structure and organization."""
|
|
925
|
+
optimizations = []
|
|
926
|
+
optimized = content
|
|
927
|
+
|
|
928
|
+
# Ensure proper document class
|
|
929
|
+
if not re.search(r'\\documentclass', optimized):
|
|
930
|
+
optimized = '\\documentclass[12pt,letterpaper]{article}\n\n' + optimized
|
|
931
|
+
optimizations.append('Added professional document class')
|
|
932
|
+
|
|
933
|
+
# Ensure title and author if missing
|
|
934
|
+
if not re.search(r'\\title\{', optimized) and not re.search(r'\\maketitle', optimized):
|
|
935
|
+
# Add after document class
|
|
936
|
+
class_match = re.search(r'(\\documentclass.*\n)', optimized)
|
|
937
|
+
if class_match:
|
|
938
|
+
insert_pos = class_match.end()
|
|
939
|
+
title_block = '\n\\title{Research Report}\n\\author{Research Team}\n\\date{\\today}\n'
|
|
940
|
+
optimized = optimized[:insert_pos] + title_block + optimized[insert_pos:]
|
|
941
|
+
optimizations.append('Added title and author information')
|
|
942
|
+
|
|
943
|
+
# Add table of contents if document has sections
|
|
944
|
+
if re.search(r'\\(section|chapter)', optimized) and not re.search(r'\\tableofcontents', optimized):
|
|
945
|
+
# Add after \begin{document} and \maketitle
|
|
946
|
+
begin_doc = re.search(r'\\begin\{document\}', optimized)
|
|
947
|
+
if begin_doc:
|
|
948
|
+
# Look for \maketitle or add TOC right after \begin{document}
|
|
949
|
+
maketitle_match = re.search(r'\\maketitle', optimized[begin_doc.end():])
|
|
950
|
+
if maketitle_match:
|
|
951
|
+
insert_pos = begin_doc.end() + maketitle_match.end()
|
|
952
|
+
toc_block = '\n\\tableofcontents\n\\newpage\n'
|
|
953
|
+
else:
|
|
954
|
+
insert_pos = begin_doc.end()
|
|
955
|
+
toc_block = '\n\\tableofcontents\n\\newpage\n'
|
|
956
|
+
|
|
957
|
+
optimized = optimized[:insert_pos] + toc_block + optimized[insert_pos:]
|
|
958
|
+
optimizations.append('Added table of contents')
|
|
959
|
+
|
|
960
|
+
# Ensure proper section hierarchy
|
|
961
|
+
optimized, hierarchy_opts = self._fix_section_hierarchy(optimized)
|
|
962
|
+
optimizations.extend(hierarchy_opts)
|
|
963
|
+
|
|
964
|
+
return optimized, optimizations
|
|
965
|
+
|
|
966
|
+
def _optimize_typography(self, content: str, level: str) -> Tuple[str, List[str]]:
|
|
967
|
+
"""Optimize typography and formatting."""
|
|
968
|
+
optimizations = []
|
|
969
|
+
optimized = content
|
|
970
|
+
|
|
971
|
+
# Essential typography packages
|
|
972
|
+
essential_packages = [
|
|
973
|
+
('fontenc', '\\usepackage[T1]{fontenc}'),
|
|
974
|
+
('inputenc', '\\usepackage[utf8]{inputenc}'),
|
|
975
|
+
('lmodern', '\\usepackage{lmodern}'),
|
|
976
|
+
('microtype', '\\usepackage{microtype}'),
|
|
977
|
+
]
|
|
978
|
+
|
|
979
|
+
# Add packages if missing
|
|
980
|
+
for package_name, package_line in essential_packages:
|
|
981
|
+
if not re.search(f'\\\\usepackage.*{{{package_name}}}', optimized):
|
|
982
|
+
# Insert after documentclass
|
|
983
|
+
class_match = re.search(r'(\\documentclass.*\n)', optimized)
|
|
984
|
+
if class_match:
|
|
985
|
+
insert_pos = class_match.end()
|
|
986
|
+
optimized = optimized[:insert_pos] + package_line + '\n' + optimized[insert_pos:]
|
|
987
|
+
optimizations.append(f'Added {package_name} package for better typography')
|
|
988
|
+
|
|
989
|
+
# Add geometry for proper margins
|
|
990
|
+
if not re.search(r'\\usepackage.*\{geometry\}', optimized):
|
|
991
|
+
class_match = re.search(r'(\\documentclass.*\n)', optimized)
|
|
992
|
+
if class_match:
|
|
993
|
+
insert_pos = class_match.end()
|
|
994
|
+
geometry_block = '\\usepackage{geometry}\n\\geometry{margin=1in}\n'
|
|
995
|
+
optimized = optimized[:insert_pos] + geometry_block + optimized[insert_pos:]
|
|
996
|
+
optimizations.append('Added geometry package with proper margins')
|
|
997
|
+
|
|
998
|
+
# Add spacing improvements
|
|
999
|
+
if level in ['moderate', 'aggressive']:
|
|
1000
|
+
if not re.search(r'\\usepackage.*\{setspace\}', optimized):
|
|
1001
|
+
class_match = re.search(r'(\\documentclass.*\n)', optimized)
|
|
1002
|
+
if class_match:
|
|
1003
|
+
insert_pos = class_match.end()
|
|
1004
|
+
spacing_block = '\\usepackage{setspace}\n\\onehalfspacing\n'
|
|
1005
|
+
optimized = optimized[:insert_pos] + spacing_block + optimized[insert_pos:]
|
|
1006
|
+
optimizations.append('Added improved line spacing')
|
|
1007
|
+
|
|
1008
|
+
# Fix spacing issues
|
|
1009
|
+
spacing_fixes = [
|
|
1010
|
+
(r'\s{2,}', ' ', 'Fixed multiple consecutive spaces'),
|
|
1011
|
+
(r'([.!?])([A-Z])', r'\1 \2', 'Added missing spaces after sentences'),
|
|
1012
|
+
(r'\s+([.!?])', r'\1', 'Fixed spaces before punctuation'),
|
|
1013
|
+
]
|
|
1014
|
+
|
|
1015
|
+
for pattern, replacement, description in spacing_fixes:
|
|
1016
|
+
if re.search(pattern, optimized):
|
|
1017
|
+
optimized = re.sub(pattern, replacement, optimized)
|
|
1018
|
+
optimizations.append(description)
|
|
1019
|
+
|
|
1020
|
+
return optimized, optimizations
|
|
1021
|
+
|
|
1022
|
+
def _optimize_tables(self, content: str) -> Tuple[str, List[str]]:
|
|
1023
|
+
"""Optimize table formatting."""
|
|
1024
|
+
optimizations = []
|
|
1025
|
+
optimized = content
|
|
1026
|
+
|
|
1027
|
+
# Check if document has tables
|
|
1028
|
+
has_tables = re.search(r'\\begin\{tabular\}|\\begin\{table\}', optimized)
|
|
1029
|
+
|
|
1030
|
+
if has_tables:
|
|
1031
|
+
# Add booktabs package
|
|
1032
|
+
if not re.search(r'\\usepackage.*\{booktabs\}', optimized):
|
|
1033
|
+
class_match = re.search(r'(\\documentclass.*\n)', optimized)
|
|
1034
|
+
if class_match:
|
|
1035
|
+
insert_pos = class_match.end()
|
|
1036
|
+
optimized = optimized[:insert_pos] + '\\usepackage{booktabs}\n' + optimized[insert_pos:]
|
|
1037
|
+
optimizations.append('Added booktabs package for professional tables')
|
|
1038
|
+
|
|
1039
|
+
# Replace \\hline with booktabs rules
|
|
1040
|
+
if re.search(r'\\hline', optimized):
|
|
1041
|
+
# This is a simplified replacement - in practice, you'd want more sophisticated logic
|
|
1042
|
+
optimized = re.sub(r'\\hline', '\\midrule', optimized)
|
|
1043
|
+
optimizations.append('Replaced \\hline with professional booktabs rules')
|
|
1044
|
+
|
|
1045
|
+
# Add array package for better column types
|
|
1046
|
+
if not re.search(r'\\usepackage.*\{array\}', optimized):
|
|
1047
|
+
class_match = re.search(r'(\\documentclass.*\n)', optimized)
|
|
1048
|
+
if class_match:
|
|
1049
|
+
insert_pos = class_match.end()
|
|
1050
|
+
optimized = optimized[:insert_pos] + '\\usepackage{array}\n' + optimized[insert_pos:]
|
|
1051
|
+
optimizations.append('Added array package for better table formatting')
|
|
1052
|
+
|
|
1053
|
+
return optimized, optimizations
|
|
1054
|
+
|
|
1055
|
+
def _optimize_figures(self, content: str) -> Tuple[str, List[str]]:
|
|
1056
|
+
"""Optimize figure formatting and placement."""
|
|
1057
|
+
optimizations = []
|
|
1058
|
+
optimized = content
|
|
1059
|
+
|
|
1060
|
+
# Check if document has figures
|
|
1061
|
+
has_figures = re.search(r'\\includegraphics|\\begin\{figure\}', optimized)
|
|
1062
|
+
|
|
1063
|
+
if has_figures:
|
|
1064
|
+
# Essential figure packages
|
|
1065
|
+
figure_packages = [
|
|
1066
|
+
('graphicx', '\\usepackage{graphicx}'),
|
|
1067
|
+
('float', '\\usepackage{float}'),
|
|
1068
|
+
('caption', '\\usepackage{caption}')
|
|
1069
|
+
]
|
|
1070
|
+
|
|
1071
|
+
for package_name, package_line in figure_packages:
|
|
1072
|
+
if not re.search(f'\\\\usepackage.*{{{package_name}}}', optimized):
|
|
1073
|
+
class_match = re.search(r'(\\documentclass.*\n)', optimized)
|
|
1074
|
+
if class_match:
|
|
1075
|
+
insert_pos = class_match.end()
|
|
1076
|
+
optimized = optimized[:insert_pos] + package_line + '\n' + optimized[insert_pos:]
|
|
1077
|
+
optimizations.append(f'Added {package_name} package for better figures')
|
|
1078
|
+
|
|
1079
|
+
# Improve figure placement
|
|
1080
|
+
figure_placements = re.findall(r'\\begin\{figure\}\[([^\]]*)\]', optimized)
|
|
1081
|
+
poor_placements = [p for p in figure_placements if 'h' in p and 't' not in p and 'b' not in p]
|
|
1082
|
+
|
|
1083
|
+
if poor_placements:
|
|
1084
|
+
# Replace poor placements with better options
|
|
1085
|
+
optimized = re.sub(r'\\begin\{figure\}\[h\]', '\\begin{figure}[htbp]', optimized)
|
|
1086
|
+
optimizations.append('Improved figure placement options')
|
|
1087
|
+
|
|
1088
|
+
return optimized, optimizations
|
|
1089
|
+
|
|
1090
|
+
def _optimize_references(self, content: str) -> Tuple[str, List[str]]:
|
|
1091
|
+
"""Optimize references and citations."""
|
|
1092
|
+
optimizations = []
|
|
1093
|
+
optimized = content
|
|
1094
|
+
|
|
1095
|
+
# Add hyperref for better navigation (should be last)
|
|
1096
|
+
if not re.search(r'\\usepackage.*\{hyperref\}', optimized):
|
|
1097
|
+
# Add before \begin{document}
|
|
1098
|
+
begin_doc = re.search(r'\\begin\{document\}', optimized)
|
|
1099
|
+
if begin_doc:
|
|
1100
|
+
insert_pos = begin_doc.start()
|
|
1101
|
+
hyperref_block = '\\usepackage{hyperref}\n\\hypersetup{\n colorlinks=true,\n linkcolor=blue,\n citecolor=red,\n urlcolor=blue\n}\n\n'
|
|
1102
|
+
optimized = optimized[:insert_pos] + hyperref_block + optimized[insert_pos:]
|
|
1103
|
+
optimizations.append('Added hyperref package for better navigation')
|
|
1104
|
+
|
|
1105
|
+
return optimized, optimizations
|
|
1106
|
+
|
|
1107
|
+
def _apply_general_cleanup(self, content: str) -> Tuple[str, List[str]]:
|
|
1108
|
+
"""Apply general cleanup and improvements."""
|
|
1109
|
+
optimizations = []
|
|
1110
|
+
optimized = content
|
|
1111
|
+
|
|
1112
|
+
# Remove excessive blank lines
|
|
1113
|
+
original_lines = len(optimized.split('\n'))
|
|
1114
|
+
optimized = re.sub(r'\n{3,}', '\n\n', optimized)
|
|
1115
|
+
new_lines = len(optimized.split('\n'))
|
|
1116
|
+
|
|
1117
|
+
if new_lines < original_lines:
|
|
1118
|
+
optimizations.append(f'Cleaned up excessive blank lines ({original_lines - new_lines} lines removed)')
|
|
1119
|
+
|
|
1120
|
+
# Fix common LaTeX spacing issues
|
|
1121
|
+
common_fixes = [
|
|
1122
|
+
(r'\\section\s*\{', r'\\section{', 'Fixed section command spacing'),
|
|
1123
|
+
(r'\\subsection\s*\{', r'\\subsection{', 'Fixed subsection command spacing'),
|
|
1124
|
+
(r'\\textbf\s*\{', r'\\textbf{', 'Fixed textbf command spacing'),
|
|
1125
|
+
(r'\\textit\s*\{', r'\\textit{', 'Fixed textit command spacing'),
|
|
1126
|
+
]
|
|
1127
|
+
|
|
1128
|
+
for pattern, replacement, description in common_fixes:
|
|
1129
|
+
if re.search(pattern, optimized):
|
|
1130
|
+
optimized = re.sub(pattern, replacement, optimized)
|
|
1131
|
+
optimizations.append(description)
|
|
1132
|
+
|
|
1133
|
+
return optimized, optimizations
|
|
1134
|
+
|
|
1135
|
+
def _fix_section_hierarchy(self, content: str) -> Tuple[str, List[str]]:
|
|
1136
|
+
"""Fix section hierarchy issues."""
|
|
1137
|
+
optimizations = []
|
|
1138
|
+
# This would contain logic to fix section nesting issues
|
|
1139
|
+
# For now, return as-is
|
|
1140
|
+
return content, optimizations
|
|
1141
|
+
|
|
1142
|
+
def _final_formatting_pass(self, content: str) -> str:
|
|
1143
|
+
"""Apply final formatting improvements.
|
|
1144
|
+
|
|
1145
|
+
Only modifies the document body — the preamble (everything before
|
|
1146
|
+
\\begin{document}) is returned unchanged to avoid breaking custom
|
|
1147
|
+
macro definitions (\\newcommand, \\newenvironment, etc.).
|
|
1148
|
+
"""
|
|
1149
|
+
# Split at \begin{document} so regexes only touch the body
|
|
1150
|
+
split_marker = "\\begin{document}"
|
|
1151
|
+
marker_pos = content.find(split_marker)
|
|
1152
|
+
if marker_pos == -1:
|
|
1153
|
+
# No \begin{document} — apply to entire content (legacy path)
|
|
1154
|
+
body = content
|
|
1155
|
+
preamble = ""
|
|
1156
|
+
rejoin = False
|
|
1157
|
+
else:
|
|
1158
|
+
preamble = content[:marker_pos + len(split_marker)]
|
|
1159
|
+
body = content[marker_pos + len(split_marker):]
|
|
1160
|
+
rejoin = True
|
|
1161
|
+
|
|
1162
|
+
# Ensure proper spacing around environments
|
|
1163
|
+
# Preserve optional arguments like \begin{tikzpicture}[remember picture, overlay]
|
|
1164
|
+
body = re.sub(r'(\\begin\{[^}]+\}(?:\[[^\]]*\])?)\n{0,1}', r'\1\n', body)
|
|
1165
|
+
body = re.sub(r'\n{0,1}(\\end\{[^}]+\})', r'\n\1', body)
|
|
1166
|
+
|
|
1167
|
+
# Ensure proper spacing around sections
|
|
1168
|
+
body = re.sub(r'(\\(?:sub)*section\{[^}]+\})\n{0,1}', r'\1\n\n', body)
|
|
1169
|
+
|
|
1170
|
+
if rejoin:
|
|
1171
|
+
result = preamble + body
|
|
1172
|
+
else:
|
|
1173
|
+
result = body
|
|
1174
|
+
|
|
1175
|
+
# Clean up final whitespace
|
|
1176
|
+
return result.strip()
|
|
1177
|
+
|
|
1178
|
+
def calculate_optimization_score(self, before_issues: int, after_issues: int, optimizations_count: int) -> int:
    """Score optimization effectiveness on a 0-100 scale.

    Args:
        before_issues: Issue count before optimization.
        after_issues: Issue count after optimization.
        optimizations_count: Number of optimizations that were applied.

    Returns:
        Integer score in [0, 100].
    """
    resolved = max(0, before_issues - after_issues)

    # Up to 50 points for resolved issues, up to 30 for applied tweaks.
    total = min(50, resolved * 5) + min(30, optimizations_count * 2)

    # 20-point bonus when more than half of the original issues are gone.
    if resolved > before_issues * 0.5:
        total += 20

    return min(100, total)
|