algomath-extract 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +260 -0
- package/bin/algo-extract.js +143 -0
- package/bin/algo-generate.js +102 -0
- package/bin/algo-help.js +136 -0
- package/bin/algo-list.js +56 -0
- package/bin/algo-run.js +141 -0
- package/bin/algo-status.js +88 -0
- package/bin/algo-verify.js +189 -0
- package/bin/install.js +349 -0
- package/package.json +57 -0
- package/requirements.txt +20 -0
- package/src/__pycache__/intent.cpython-313.pyc +0 -0
- package/src/cli/__pycache__/commands.cpython-313.pyc +0 -0
- package/src/cli/cli_entry.py +106 -0
- package/src/cli/commands.py +339 -0
- package/src/execution/__init__.py +74 -0
- package/src/execution/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/execution/__pycache__/display.cpython-313.pyc +0 -0
- package/src/execution/__pycache__/errors.cpython-313.pyc +0 -0
- package/src/execution/__pycache__/executor.cpython-313.pyc +0 -0
- package/src/execution/__pycache__/sandbox.cpython-313.pyc +0 -0
- package/src/execution/display.py +261 -0
- package/src/execution/errors.py +158 -0
- package/src/execution/executor.py +253 -0
- package/src/execution/sandbox.py +333 -0
- package/src/extraction/__init__.py +102 -0
- package/src/extraction/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/boundaries.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/errors.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/llm_extraction.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/notation.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/parser.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/pdf_processor.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/prompts.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/review.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/schema.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/validation.cpython-313.pyc +0 -0
- package/src/extraction/boundaries.py +281 -0
- package/src/extraction/errors.py +156 -0
- package/src/extraction/llm_extraction.py +225 -0
- package/src/extraction/notation.py +240 -0
- package/src/extraction/parser.py +402 -0
- package/src/extraction/pdf_processor.py +281 -0
- package/src/extraction/prompts.py +90 -0
- package/src/extraction/review.py +298 -0
- package/src/extraction/schema.py +173 -0
- package/src/extraction/validation.py +202 -0
- package/src/generation/__init__.py +79 -0
- package/src/generation/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/code_generator.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/errors.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/hybrid.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/llm_generator.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/persistence.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/prompts.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/review.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/templates.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/types.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/validation.cpython-313.pyc +0 -0
- package/src/generation/code_generator.py +375 -0
- package/src/generation/errors.py +84 -0
- package/src/generation/hybrid.py +210 -0
- package/src/generation/llm_generator.py +223 -0
- package/src/generation/persistence.py +221 -0
- package/src/generation/prompts.py +202 -0
- package/src/generation/review.py +254 -0
- package/src/generation/templates.py +208 -0
- package/src/generation/types.py +196 -0
- package/src/generation/validation.py +278 -0
- package/src/intent.py +323 -0
- package/src/verification/__init__.py +63 -0
- package/src/verification/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/verification/__pycache__/checker.cpython-313.pyc +0 -0
- package/src/verification/__pycache__/comparison.cpython-313.pyc +0 -0
- package/src/verification/__pycache__/explainer.cpython-313.pyc +0 -0
- package/src/verification/__pycache__/static_analysis.cpython-313.pyc +0 -0
- package/src/verification/checker.py +220 -0
- package/src/verification/comparison.py +492 -0
- package/src/verification/explainer.py +414 -0
- package/src/verification/static_analysis.py +540 -0
- package/src/workflows/__init__.py +21 -0
- package/src/workflows/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/workflows/__pycache__/extract.cpython-313.pyc +0 -0
- package/src/workflows/__pycache__/generate.cpython-313.pyc +0 -0
- package/src/workflows/__pycache__/run.cpython-313.pyc +0 -0
- package/src/workflows/__pycache__/verify.cpython-313.pyc +0 -0
- package/src/workflows/extract.py +181 -0
- package/src/workflows/generate.py +155 -0
- package/src/workflows/run.py +187 -0
- package/src/workflows/verify.py +334 -0
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
"""Algorithm boundary detection for extraction.
|
|
2
|
+
|
|
3
|
+
Identifies algorithm sections including headers, inputs, outputs,
|
|
4
|
+
and step boundaries within mathematical text.
|
|
5
|
+
|
|
6
|
+
Per D-12, D-13, D-14, D-15, D-16, D-17 from 02-CONTEXT.md.
|
|
7
|
+
"""
|
|
8
|
+
import re
|
|
9
|
+
from typing import Dict, List, Optional, Tuple, NamedTuple
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class AlgorithmBoundaries:
    """Represents detected algorithm boundaries.

    Every position field is optional and defaults to ``None``. When the
    instance is built by ``detect_algorithm_boundaries`` the positions are
    1-indexed line numbers into the analysed text.
    """
    name: str  # algorithm name; find_algorithm_name() yields "unnamed" when no header matches
    name_line: Optional[int] = None  # line of the matching header, if any
    input_start: Optional[int] = None  # line of the input section header
    input_end: Optional[int] = None  # last line counted as part of the input section
    output_start: Optional[int] = None  # line of the output section header
    output_end: Optional[int] = None  # last line counted as part of the output section
    steps_start: Optional[int] = None  # first line of the steps region
    steps_end: Optional[int] = None  # last line of the steps region
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Patterns for detecting algorithm headers.
# Each pattern is anchored at the start of a line, requires at least one
# whitespace or colon after the keyword, and captures the name in group 1.
# The functions in this module apply them with re.IGNORECASE.
HEADER_PATTERNS = [
    r'^\s*(?:Algorithm|ALGORITHM)[\s:]+([A-Za-z][A-Za-z0-9_\s]*)',
    r'^\s*(?:Procedure|PROCEDURE)[\s:]+([A-Za-z][A-Za-z0-9_\s]*)',
    r'^\s*(?:Function|FUNCTION)[\s:]+([A-Za-z][A-Za-z0-9_\s]*)',
    r'^\s*(?:Method|METHOD)[\s:]+([A-Za-z][A-Za-z0-9_\s]*)',
]

# Patterns for input sections.
# The trailing separator class is optional ([\s:]*), so a line only has to
# *begin* with the keyword to match (e.g. "Inputting ..." also matches).
INPUT_PATTERNS = [
    r'^\s*(?:Input|INPUT|Inputs|INPUTS)[\s:]*',
    r'^\s*(?:Given|GIVEN)[\s:]*',
    r'^\s*(?:Parameters|PARAMETERS)[\s:]*',
    r'^\s*(?:Takes|TAKES)[\s:]*',
    r'^\s*(?:Requires|REQUIRES)[\s:]*',
    r'^\s*(?:Precondition|PRECONDITION)[\s:]*',
]

# Patterns for output sections.
# Same shape as INPUT_PATTERNS: keyword prefix at line start, optional
# whitespace/colon separators afterwards.
OUTPUT_PATTERNS = [
    r'^\s*(?:Output|OUTPUT|Outputs|OUTPUTS)[\s:]*',
    r'^\s*(?:Returns|RETURNS)[\s:]*',
    r'^\s*(?:Result|RESULT|Results|RESULTS)[\s:]*',
    r'^\s*(?:Produces|PRODUCES)[\s:]*',
    r'^\s*(?:Postcondition|POSTCONDITION)[\s:]*',
]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def find_algorithm_name(text: str) -> Tuple[str, Optional[int]]:
    """
    Locate the algorithm's name in a header line.

    Lines are scanned top to bottom and each one is tried against every
    entry of HEADER_PATTERNS (case-insensitively), which recognise forms
    such as:
    - "Algorithm: Name"
    - "Algorithm Name"
    - "Procedure: Name"
    - "Function Name"

    Args:
        text: Algorithm text

    Returns:
        Tuple of (name, line_number) for the first header found, where
        line_number is 1-indexed, or ("unnamed", None) when no line matches

    Per D-13 from 02-CONTEXT.md.
    """
    for position, raw_line in enumerate(text.split('\n'), start=1):
        for header_regex in HEADER_PATTERNS:
            found = re.match(header_regex, raw_line, re.IGNORECASE)
            if found is None:
                continue
            # Trim the captured name and squeeze whitespace runs to one space.
            candidate = re.sub(r'\s+', ' ', found.group(1).strip())
            if candidate:
                return candidate, position

    return "unnamed", None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def extract_input_section(text: str) -> Tuple[Optional[int], Optional[int], List[str]]:
    """
    Extract input section from algorithm text.

    The section starts at the first line matching any INPUT_PATTERNS entry
    (case-insensitive) and extends until the next line that
    ``_is_section_boundary`` reports as a boundary, or the end of the text.

    Args:
        text: Algorithm text

    Returns:
        Tuple of (start_line, end_line, input_descriptions)
        Lines are 1-indexed, None if not found. ``input_descriptions``
        holds the text following the header keyword on the header line (if
        any), then each subsequent section line with surrounding
        whitespace stripped.

    Per D-15 from 02-CONTEXT.md.
    """
    lines = text.split('\n')

    # Find input section header: first line that any input pattern accepts.
    start_line = next(
        (
            line_num
            for line_num, line in enumerate(lines, 1)
            if any(re.match(pattern, line, re.IGNORECASE) for pattern in INPUT_PATTERNS)
        ),
        None,
    )
    if start_line is None:
        return None, None, []

    input_descriptions = []

    # Remove the header keyword from the header line. The header can be any
    # keyword from INPUT_PATTERNS in any letter case ("Inputs", "Given",
    # "input", ...), so drop the whole leading alphabetic word plus the
    # separators after it rather than only a literal "Input"/"INPUT".
    header_rest = re.sub(r'^\s*[A-Za-z]+[\s:]*', '', lines[start_line - 1]).strip()
    if header_rest:
        input_descriptions.append(header_rest)

    # Collect following lines until the next section boundary or end of text.
    end_line = start_line
    for line_num in range(start_line + 1, len(lines) + 1):
        line = lines[line_num - 1]
        if _is_section_boundary(line):
            break
        # Blank lines are boundaries, so every line reaching here has content.
        input_descriptions.append(line.strip())
        end_line = line_num

    return start_line, end_line, input_descriptions
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def extract_output_section(text: str) -> Tuple[Optional[int], Optional[int], List[str]]:
    """
    Extract output section from algorithm text.

    The section starts at the first line matching any OUTPUT_PATTERNS entry
    (case-insensitive) and extends until the next line that
    ``_is_section_boundary`` reports as a boundary, or the end of the text.

    Args:
        text: Algorithm text

    Returns:
        Tuple of (start_line, end_line, output_descriptions)
        Lines are 1-indexed, None if not found. ``output_descriptions``
        holds the text following the header keyword on the header line (if
        any), then each subsequent section line with surrounding
        whitespace stripped.

    Per D-16 from 02-CONTEXT.md.
    """
    lines = text.split('\n')

    # Find output section header: first line that any output pattern accepts.
    start_line = next(
        (
            line_num
            for line_num, line in enumerate(lines, 1)
            if any(re.match(pattern, line, re.IGNORECASE) for pattern in OUTPUT_PATTERNS)
        ),
        None,
    )
    if start_line is None:
        return None, None, []

    output_descriptions = []

    # Remove the header keyword from the header line. The header can be any
    # keyword from OUTPUT_PATTERNS in any letter case ("Outputs", "Returns",
    # "result", ...), so drop the whole leading alphabetic word plus the
    # separators after it rather than only a literal "Output"/"OUTPUT".
    header_rest = re.sub(r'^\s*[A-Za-z]+[\s:]*', '', lines[start_line - 1]).strip()
    if header_rest:
        output_descriptions.append(header_rest)

    # Collect following lines until the next section boundary or end of text.
    end_line = start_line
    for line_num in range(start_line + 1, len(lines) + 1):
        line = lines[line_num - 1]
        if _is_section_boundary(line):
            break
        # Blank lines are boundaries, so every line reaching here has content.
        output_descriptions.append(line.strip())
        end_line = line_num

    return start_line, end_line, output_descriptions
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def detect_algorithm_boundaries(text: str) -> AlgorithmBoundaries:
    """
    Detect all algorithm boundaries in text.

    Combines the header, input-section and output-section detectors and
    then locates the steps region.

    Per D-12, D-13, D-14 from 02-CONTEXT.md.

    Args:
        text: Algorithm text

    Returns:
        AlgorithmBoundaries with detected sections. ``steps_end`` is always
        the number of lines in ``text``; ``steps_start`` is the first
        numbered/"Step N" line at or after the search start, or the search
        start itself when no such line exists.
    """
    name, name_line = find_algorithm_name(text)
    input_start, input_end, _ = extract_input_section(text)
    output_start, output_end, _ = extract_output_section(text)

    rows = text.split('\n')
    total = len(rows)

    # Earliest line the steps may begin on: the header line itself (or line
    # 1 without a header), pushed past the end of any input/output section.
    # NOTE(review): the header line is itself a candidate, not the line
    # after it - confirm this is intended.
    floors = [name_line or 1]
    if input_end:
        floors.append(input_end + 1)
    if output_end:
        floors.append(output_end + 1)
    search_from = max(floors)

    def _looks_like_step(row: str) -> bool:
        # "1. ..." / "1) ..." or "Step 1" / "step 1"
        return bool(
            re.match(r'^\s*\d+[.\)]\s+', row)
            or re.match(r'^\s*[Ss]tep\s+\d+', row)
        )

    first_step = next(
        (n for n in range(search_from, total + 1) if _looks_like_step(rows[n - 1])),
        None,
    )

    return AlgorithmBoundaries(
        name=name,
        name_line=name_line,
        input_start=input_start,
        input_end=input_end,
        output_start=output_start,
        output_end=output_end,
        # Without any recognisable step marker, fall back to the search start.
        steps_start=first_step if first_step else search_from,
        steps_end=total,
    )
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _is_section_boundary(line: str) -> bool:
|
|
257
|
+
"""
|
|
258
|
+
Check if line marks a section boundary.
|
|
259
|
+
|
|
260
|
+
Returns True for:
|
|
261
|
+
- Empty lines (double newline)
|
|
262
|
+
- Output headers after input
|
|
263
|
+
- Step indicators
|
|
264
|
+
- Algorithm boundaries
|
|
265
|
+
"""
|
|
266
|
+
stripped = line.strip()
|
|
267
|
+
|
|
268
|
+
if not stripped:
|
|
269
|
+
return True
|
|
270
|
+
|
|
271
|
+
# Check for section headers
|
|
272
|
+
all_headers = HEADER_PATTERNS + INPUT_PATTERNS + OUTPUT_PATTERNS
|
|
273
|
+
for pattern in all_headers:
|
|
274
|
+
if re.match(pattern, line, re.IGNORECASE):
|
|
275
|
+
return True
|
|
276
|
+
|
|
277
|
+
# Check for numbered steps
|
|
278
|
+
if re.match(r'^\s*\d+[.\)]', line):
|
|
279
|
+
return True
|
|
280
|
+
|
|
281
|
+
return False
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Extraction error types for AlgoMath."""
|
|
2
|
+
from typing import Optional, List
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class ExtractionError(Exception):
    """Common base for all extraction failures.

    Carries a human-readable message plus an optional source line number
    and an optional suggestion for the user.
    """

    message: str
    line_number: Optional[int] = None
    suggestion: Optional[str] = None

    def __str__(self) -> str:
        # Location and suggestion are appended only when they are truthy.
        location = f" (at line {self.line_number})" if self.line_number else ""
        advice = f"\nSuggestion: {self.suggestion}" if self.suggestion else ""
        return self.message + location + advice

    def to_dict(self) -> dict:
        """Return a plain dict (JSON-friendly) describing this error."""
        return dict(
            type=type(self).__name__,
            message=self.message,
            line_number=self.line_number,
            suggestion=self.suggestion,
        )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ParseError(ExtractionError):
    """
    Signals that the text could not be parsed.

    The stored message is prefixed with "Parse error: " and a default
    suggestion is used when none (or an empty one) is supplied.

    Per D-23 from 02-CONTEXT.md.
    """

    def __init__(self, message: str, line_number: Optional[int] = None,
                 suggestion: Optional[str] = None):
        hint = suggestion if suggestion else "Check syntax and try again"
        super().__init__(f"Parse error: {message}", line_number, hint)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class AmbiguityError(ExtractionError):
    """
    Signals that the text admits more than one valid interpretation.

    The stored message is prefixed with "Ambiguity: "; the candidate
    readings are kept on ``interpretations`` (an empty list when none are
    given).

    Per D-23 from 02-CONTEXT.md.
    """

    def __init__(self, message: str, line_number: Optional[int] = None,
                 interpretations: Optional[List[str]] = None,
                 suggestion: Optional[str] = None):
        hint = suggestion if suggestion else "Provide more context"
        super().__init__(f"Ambiguity: {message}", line_number, hint)
        self.interpretations = interpretations if interpretations else []
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class IncompleteError(ExtractionError):
    """
    Signals that the algorithm description appears to be incomplete.

    The stored message is prefixed with "Incomplete: "; the names of the
    missing pieces are kept on ``missing`` (an empty list when none are
    given).

    Per D-23 from 02-CONTEXT.md.
    """

    def __init__(self, message: str, line_number: Optional[int] = None,
                 missing: Optional[List[str]] = None,
                 suggestion: Optional[str] = None):
        hint = suggestion if suggestion else "Add missing information"
        super().__init__(f"Incomplete: {message}", line_number, hint)
        self.missing = missing if missing else []
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def categorize_error(error_text: str, line_number: Optional[int] = None) -> ExtractionError:
    """
    Categorize an error message into appropriate error type.

    Classification is by case-insensitive substring search. Categories are
    tried in order: truncated input ("unexpected end"), parse errors,
    ambiguity errors, remaining incomplete errors, then a generic
    ExtractionError.

    Args:
        error_text: Raw error message
        line_number: Line where error occurred

    Returns:
        Categorized ExtractionError

    Per D-23 from 02-CONTEXT.md.
    """
    text_lower = error_text.lower()

    def mentions(keywords: List[str]) -> bool:
        return any(keyword in text_lower for keyword in keywords)

    def incomplete() -> ExtractionError:
        return IncompleteError(
            message=error_text,
            line_number=line_number,
            suggestion="Ensure the algorithm has a complete description"
        )

    # "unexpected end" is listed as an incomplete-input keyword but also
    # contains the parse keyword "unexpected". It has to be tested before
    # the parse keywords, otherwise it can never select IncompleteError.
    if "unexpected end" in text_lower:
        return incomplete()

    # Parse errors
    parse_patterns = [
        "unmatched", "unexpected", "invalid syntax", "parse",
        "cannot parse", "syntax error", "malformed"
    ]
    if mentions(parse_patterns):
        return ParseError(
            message=error_text,
            line_number=line_number,
            suggestion="Check for matching parentheses, brackets, or quotes"
        )

    # Ambiguity errors
    ambiguity_patterns = [
        "ambiguous", "could mean", "could be", "unclear",
        "multiple interpretations", "not sure if", "could refer to"
    ]
    if mentions(ambiguity_patterns):
        return AmbiguityError(
            message=error_text,
            line_number=line_number,
            suggestion="Clarify the meaning with more specific language"
        )

    # Incomplete errors
    incomplete_patterns = [
        "incomplete", "missing", "not found", "expected",
        "end of input", "unexpected end", "truncated"
    ]
    if mentions(incomplete_patterns):
        return incomplete()

    # Default to generic extraction error
    return ExtractionError(
        message=error_text,
        line_number=line_number,
        suggestion="Review the text and try again"
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def format_errors_for_user(errors: List[ExtractionError]) -> str:
    """
    Render a list of errors as one numbered, user-facing report.

    An empty list yields "No errors found."; otherwise a heading line is
    followed by the string form of every error, numbered from 1 and
    separated by blank lines.

    Per D-24 from 02-CONTEXT.md.
    """
    if not errors:
        return "No errors found."

    # Each entry carries its own leading newline, which together with the
    # join separator produces an empty line between entries.
    numbered = [f"\n{index}. {problem}" for index, problem in enumerate(errors, start=1)]
    return "\n".join(["Extraction completed with issues:", *numbered])
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""LLM-based extraction with hybrid fallback to rule-based parser."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from typing import Optional, List, Any
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
from .schema import Algorithm, Step, StepType
|
|
9
|
+
from .parser import RuleBasedParser
|
|
10
|
+
from .prompts import EXTRACTION_SYSTEM_PROMPT, format_extraction_prompt
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class ExtractionResult:
    """Outcome of one extraction attempt together with how it was produced."""
    algorithm: Algorithm
    success: bool
    method: str  # "llm", "rule_based", or "failed"
    errors: List[str]
    line_references: List[List[int]]

    def __post_init__(self):
        # Callers may pass errors=None; normalise that to an empty list.
        if self.errors is not None:
            return
        self.errors = []
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def extract_algorithm_llm(
    text: str,
    timeout: int = 30
) -> ExtractionResult:
    """
    Extract an algorithm with the LLM, falling back to the rule-based parser.

    The LLM path is attempted first. If it yields nothing usable or raises,
    the reason is recorded and RuleBasedParser is tried. If that raises as
    well, an empty "unnamed" algorithm is returned with ``success=False``
    and all collected error messages.

    Args:
        text: Algorithm description text
        timeout: Maximum time in seconds (per D-27), forwarded to the LLM call

    Returns:
        ExtractionResult with algorithm and metadata

    Per D-01, D-04 from 02-CONTEXT.md.
    """
    problems = []

    # Stage 1: LLM extraction (using agent's completion capability).
    try:
        prompt = format_extraction_prompt(text)
        reply = _call_llm(
            system=EXTRACTION_SYSTEM_PROMPT,
            user=prompt,
            timeout=timeout,
        )
        parsed = _parse_llm_response(reply, text) if reply else None
        if parsed:
            return ExtractionResult(
                algorithm=parsed,
                success=True,
                method="llm",
                errors=[],
                line_references=[step.line_refs for step in parsed.steps],
            )
        problems.append("LLM extraction returned no valid result")
    except Exception as exc:
        problems.append(f"LLM extraction failed: {exc!s}")

    # Stage 2: rule-based fallback.
    try:
        fallback = RuleBasedParser().parse(text)
        return ExtractionResult(
            algorithm=fallback,
            success=True,
            method="rule_based",
            errors=[*problems, "Fell back to rule-based parser"],
            line_references=[step.line_refs for step in fallback.steps],
        )
    except Exception as exc:
        problems.append(f"Rule-based fallback failed: {exc!s}")

    # Both stages failed: hand back an empty algorithm with the error log.
    return ExtractionResult(
        algorithm=Algorithm(name="unnamed", source_text=text),
        success=False,
        method="failed",
        errors=problems,
        line_references=[],
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _call_llm(system: str, user: str, timeout: int) -> Optional[str]:
|
|
100
|
+
"""
|
|
101
|
+
Call LLM for extraction. Uses agent's native capabilities.
|
|
102
|
+
|
|
103
|
+
In actual implementation, this would call the AI assistant.
|
|
104
|
+
For now, returns None to trigger fallback.
|
|
105
|
+
"""
|
|
106
|
+
# Placeholder - actual implementation would use agent
|
|
107
|
+
return None
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _parse_llm_response(response: str, original_text: str) -> Optional[Algorithm]:
    """
    Turn the LLM's JSON reply into an Algorithm object.

    The JSON may be wrapped in a markdown code fence (optionally tagged
    ``json``); otherwise the whole reply is treated as JSON. Missing keys
    fall back to defaults and unknown step types become StepType.COMMENT.

    Args:
        response: JSON string from LLM
        original_text: Original algorithm text, stored as ``source_text``

    Returns:
        Algorithm object, or None if anything goes wrong while parsing
    """
    # Step fields that default to an empty list vs. those that default to None.
    list_keys = ("inputs", "outputs", "line_refs", "body", "else_body", "arguments")
    optional_keys = (
        "condition", "iter_var", "iter_range",
        "expression", "call_target", "annotation",
    )

    try:
        # Prefer the contents of a fenced code block when one is present.
        fenced = re.search(r'```(?:json)?\s*\n?(.*?)```', response, re.DOTALL)
        payload = fenced.group(1) if fenced else response
        data = json.loads(payload.strip())

        result = Algorithm(
            name=data.get("name", "unnamed"),
            description=data.get("description", ""),
            source_text=original_text,
        )
        result.inputs = data.get("inputs", [])
        result.outputs = data.get("outputs", [])

        parsed_steps = []
        for raw in data.get("steps", []):
            type_label = raw.get("type", "comment")
            try:
                kind = StepType(type_label)
            except ValueError:
                kind = StepType.COMMENT

            fields = {key: raw.get(key, []) for key in list_keys}
            fields.update((key, raw.get(key)) for key in optional_keys)

            parsed_steps.append(
                Step(
                    # Steps without an explicit id are numbered by position.
                    id=raw.get("id", len(parsed_steps) + 1),
                    type=kind,
                    description=raw.get("description", ""),
                    **fields,
                )
            )

        result.steps = parsed_steps
        return result

    except Exception:
        # Any failure (bad JSON, unexpected shape, ...) is reported as None.
        return None
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class HybridExtractor:
    """
    Front end that chooses between LLM-based and rule-based extraction.

    Per D-01, D-02 from 02-CONTEXT.md.
    """

    def __init__(self):
        self.rule_parser = RuleBasedParser()
        self.use_llm = True

    def extract(self, text: str, prefer_llm: bool = True) -> ExtractionResult:
        """
        Extract an algorithm with the preferred method.

        Args:
            text: Algorithm description
            prefer_llm: If True (and LLM use is enabled on this instance),
                delegate to extract_algorithm_llm; else use rule-based

        Returns:
            ExtractionResult with extracted algorithm. When the rule-based
            parser raises, the result has ``success=False``, method
            "failed" and the exception text as its only error.
        """
        llm_first = prefer_llm and self.use_llm
        if llm_first:
            return extract_algorithm_llm(text)

        try:
            parsed = self.rule_parser.parse(text)
            return ExtractionResult(
                algorithm=parsed,
                success=True,
                method="rule_based",
                errors=[],
                line_references=[step.line_refs for step in parsed.steps],
            )
        except Exception as exc:
            return ExtractionResult(
                algorithm=Algorithm(name="unnamed", source_text=text),
                success=False,
                method="failed",
                errors=[str(exc)],
                line_references=[],
            )
|