algomath-extract 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +260 -0
- package/bin/algo-extract.js +143 -0
- package/bin/algo-generate.js +102 -0
- package/bin/algo-help.js +136 -0
- package/bin/algo-list.js +56 -0
- package/bin/algo-run.js +141 -0
- package/bin/algo-status.js +88 -0
- package/bin/algo-verify.js +189 -0
- package/bin/install.js +349 -0
- package/package.json +57 -0
- package/requirements.txt +20 -0
- package/src/__pycache__/intent.cpython-313.pyc +0 -0
- package/src/cli/__pycache__/commands.cpython-313.pyc +0 -0
- package/src/cli/cli_entry.py +106 -0
- package/src/cli/commands.py +339 -0
- package/src/execution/__init__.py +74 -0
- package/src/execution/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/execution/__pycache__/display.cpython-313.pyc +0 -0
- package/src/execution/__pycache__/errors.cpython-313.pyc +0 -0
- package/src/execution/__pycache__/executor.cpython-313.pyc +0 -0
- package/src/execution/__pycache__/sandbox.cpython-313.pyc +0 -0
- package/src/execution/display.py +261 -0
- package/src/execution/errors.py +158 -0
- package/src/execution/executor.py +253 -0
- package/src/execution/sandbox.py +333 -0
- package/src/extraction/__init__.py +102 -0
- package/src/extraction/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/boundaries.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/errors.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/llm_extraction.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/notation.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/parser.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/pdf_processor.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/prompts.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/review.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/schema.cpython-313.pyc +0 -0
- package/src/extraction/__pycache__/validation.cpython-313.pyc +0 -0
- package/src/extraction/boundaries.py +281 -0
- package/src/extraction/errors.py +156 -0
- package/src/extraction/llm_extraction.py +225 -0
- package/src/extraction/notation.py +240 -0
- package/src/extraction/parser.py +402 -0
- package/src/extraction/pdf_processor.py +281 -0
- package/src/extraction/prompts.py +90 -0
- package/src/extraction/review.py +298 -0
- package/src/extraction/schema.py +173 -0
- package/src/extraction/validation.py +202 -0
- package/src/generation/__init__.py +79 -0
- package/src/generation/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/code_generator.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/errors.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/hybrid.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/llm_generator.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/persistence.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/prompts.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/review.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/templates.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/types.cpython-313.pyc +0 -0
- package/src/generation/__pycache__/validation.cpython-313.pyc +0 -0
- package/src/generation/code_generator.py +375 -0
- package/src/generation/errors.py +84 -0
- package/src/generation/hybrid.py +210 -0
- package/src/generation/llm_generator.py +223 -0
- package/src/generation/persistence.py +221 -0
- package/src/generation/prompts.py +202 -0
- package/src/generation/review.py +254 -0
- package/src/generation/templates.py +208 -0
- package/src/generation/types.py +196 -0
- package/src/generation/validation.py +278 -0
- package/src/intent.py +323 -0
- package/src/verification/__init__.py +63 -0
- package/src/verification/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/verification/__pycache__/checker.cpython-313.pyc +0 -0
- package/src/verification/__pycache__/comparison.cpython-313.pyc +0 -0
- package/src/verification/__pycache__/explainer.cpython-313.pyc +0 -0
- package/src/verification/__pycache__/static_analysis.cpython-313.pyc +0 -0
- package/src/verification/checker.py +220 -0
- package/src/verification/comparison.py +492 -0
- package/src/verification/explainer.py +414 -0
- package/src/verification/static_analysis.py +540 -0
- package/src/workflows/__init__.py +21 -0
- package/src/workflows/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/workflows/__pycache__/extract.cpython-313.pyc +0 -0
- package/src/workflows/__pycache__/generate.cpython-313.pyc +0 -0
- package/src/workflows/__pycache__/run.cpython-313.pyc +0 -0
- package/src/workflows/__pycache__/verify.cpython-313.pyc +0 -0
- package/src/workflows/extract.py +181 -0
- package/src/workflows/generate.py +155 -0
- package/src/workflows/run.py +187 -0
- package/src/workflows/verify.py +334 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""Mathematical notation normalization for algorithm extraction.
|
|
2
|
+
|
|
3
|
+
Transforms common mathematical notation into normalized forms
|
|
4
|
+
that can be parsed by rule-based and LLM extractors.
|
|
5
|
+
|
|
6
|
+
Per D-09, D-10, D-11 from 02-CONTEXT.md.
|
|
7
|
+
"""
|
|
8
|
+
import re
|
|
9
|
+
from typing import Tuple, Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def normalize_notation(text: str) -> str:
    """
    Rewrite mathematical notation in algorithm text into a parser-friendly form.

    The individual passes are applied one after another, each receiving
    the output of the previous one:
        1. Summation notation (Σ)
        2. Product notation (Π)
        3. Set membership (∈, ∉, ⊆, ⊇, ⊂, ⊃)
        4. Arrow notation (→, ←)
        5. Subscripts (x_i → x[i])
        6. Superscripts (x^2 → x**2)
        7. Mathematical operators (×, ÷, √, ±)

    Args:
        text: Raw algorithm text with mathematical notation

    Returns:
        Normalized text ready for parsing

    Per D-09, D-10 from 02-CONTEXT.md.
    """
    # Order matters: Σ/Π run before the generic subscript and superscript
    # passes, and operators are rewritten last.
    passes = (
        transform_summation,
        transform_product,
        transform_set_membership,
        transform_arrow_notation,
        transform_subscripts,
        transform_superscripts,
        transform_operators,
    )

    normalized = text
    for apply_pass in passes:
        normalized = apply_pass(normalized)
    return normalized
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def transform_summation(text: str) -> str:
    """
    Transform summation notation into normalized form.

    A summation may be introduced by the Greek capital sigma (U+03A3),
    the n-ary summation sign (U+2211) or the LaTeX command ``\\sum``.

    Patterns handled:
    - Σ_{i=1}^{n} f(i)  →  sum over i from 1 to n of f(i)
    - Σ_{i=1}^{n} f     →  sum over i from 1 to n of f(i)
      (the loop variable is supplied as the argument when none is written)
    - Σ_{i∈S} f(i)      →  sum over i in S f(i)
      (only the Σ_{...} prefix is rewritten; the operand stays in place)

    The ranged form is only rewritten when the operand, optionally followed
    by a single parenthesised argument list, runs to the end of its line.

    Args:
        text: Text that may contain summation notation

    Returns:
        Text with the recognized summations rewritten

    Per D-09 from 02-CONTEXT.md.
    """
    # NOTE: this must be an alternation. A character class such as
    # [Σ\\sum] matches any single one of Σ, "\", "s", "u", "m", which
    # rewrote ordinary identifiers ending in those letters.
    sigma = r'(?:[\u03A3\u2211]|\\sum)'

    # Σ_{var=start}^{end} operand[(args)] up to end of line
    ranged = (
        sigma
        + r'_\{(\w+)=([^}]+)\}\^\{([^}]+)\}\s*([^(]+?)(?:\(([^)]+)\))?$'
    )

    def replace_sum(match):
        var, start, end = match.group(1, 2, 3)
        func = match.group(4).strip()
        if not func:
            # Only whitespace followed the bounds.
            return f"sum over {var} from {start} to {end}"
        arg = match.group(5) or var
        return f"sum over {var} from {start} to {end} of {func}({arg})"

    # MULTILINE so that "$" anchors at the end of every line.
    result = re.sub(ranged, replace_sum, text, flags=re.MULTILINE)

    # Σ_{var ∈ Set}
    over_set = sigma + r'_\{(\w+)\s*∈\s*(\w+)\}'
    result = re.sub(over_set, r'sum over \1 in \2', result)

    return result
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def transform_product(text: str) -> str:
    """
    Transform product notation into normalized form.

    A product may be introduced by the Greek capital pi (U+03A0), the
    n-ary product sign (U+220F) or the LaTeX command ``\\prod``.

    Patterns handled (only the Π_{...} prefix is rewritten; whatever
    follows it is left in place):
    - Π_{i=1}^{n} f(i)  →  product over i from 1 to n f(i)
    - Π_{i∈S} f(i)      →  product over i in S f(i)

    Args:
        text: Text that may contain product notation

    Returns:
        Text with the recognized product prefixes rewritten

    Per D-09 from 02-CONTEXT.md.
    """
    # NOTE: this must be an alternation. A character class such as
    # [Π\\prod] matches any single one of Π, "\", "p", "r", "o", "d",
    # which rewrote ordinary identifiers ending in those letters.
    pi = r'(?:[\u03A0\u220F]|\\prod)'

    # Π_{var=start}^{end}
    result = re.sub(
        pi + r'_\{(\w+)=([^}]+)\}\^\{([^}]+)\}',
        r'product over \1 from \2 to \3',
        text,
    )

    # Π_{var ∈ Set}
    result = re.sub(
        pi + r'_\{(\w+)\s*∈\s*(\w+)\}',
        r'product over \1 in \2',
        result,
    )

    return result
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def transform_set_membership(text: str) -> str:
    """
    Spell out set membership and inclusion symbols in words.

    Each symbol must sit between two word-like operands; surrounding
    whitespace is optional.

    Transformations:
    - x ∈ S → x in S
    - x ∉ S → x not in S
    - A ⊆ B → A is subset of B
    - A ⊂ B → A is proper subset of B
    - A ⊇ B → A is superset of B
    - A ⊃ B → A is proper superset of B

    Per D-09 from 02-CONTEXT.md.
    """
    # (symbol, replacement template) pairs, applied in this order.
    rewrites = (
        ('∉', r'\1 not in \2'),
        ('∈', r'\1 in \2'),
        # Natural language is used for inclusion relations.
        ('⊆', r'\1 is subset of \2'),
        ('⊂', r'\1 is proper subset of \2'),
        ('⊇', r'\1 is superset of \2'),
        ('⊃', r'\1 is proper superset of \2'),
    )

    converted = text
    for symbol, template in rewrites:
        converted = re.sub(r'(\w+)\s*' + symbol + r'\s*(\w+)', template, converted)
    return converted
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def transform_arrow_notation(text: str) -> str:
    """
    Transform arrow notation into assignments.

    Transformations:
    - x → y  becomes  x = y
    - x ← y  becomes  x = y
    - x ↦ y  becomes  x maps to y

    For the assignment arrows, the left operand must be a single word and
    the value extends (lazily) to the end of the line, to the start of a
    following ``name =`` assignment, or to whitespace preceding a comma
    or full stop, whichever comes first.

    Args:
        text: Text that may contain arrow notation (one or more lines)

    Returns:
        Text with the arrows rewritten

    Per D-09 from 02-CONTEXT.md.
    """
    # Where the assigned value stops (see docstring).
    value_end = r'(?=$|\s+\w+\s*=|\s+[,.])'

    result = text
    for arrow in ('→', '←'):
        # re.MULTILINE is required: without it "$" only matches at the
        # very end of the text, so in multi-line input every arrow except
        # the one on the last line was left untouched.
        result = re.sub(
            r'(\w+)\s*' + arrow + r'\s*(.+?)' + value_end,
            r'\1 = \2',
            result,
            flags=re.MULTILINE,
        )

    # Mapping arrow; only same-line spacing around it is collapsed.
    result = re.sub(r'[ \t]*↦[ \t]*', ' maps to ', result)

    return result
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def transform_subscripts(text: str) -> str:
    """
    Transform subscript notation into array indexing.

    Transformations:
    - x_i      → x[i]
    - x_12     → x[12]
    - x_{k}    → x[k]
    - x_{i+1}  → x[i+1]
    - A_{i,j}  → A[i][j]   (one bracket pair per comma-separated index)

    An unbraced subscript is only recognized when it is a single letter
    or a run of digits that ends the word, so snake_case identifiers such
    as ``max_value`` are left untouched.

    Args:
        text: Text that may contain subscript notation

    Returns:
        Text with the recognized subscripts rewritten as indexing

    Per D-11 from 02-CONTEXT.md.
    """
    def index_each(match):
        indices = [part.strip() for part in match.group(2).split(',')]
        if not all(indices):
            # Stray comma or blank index: leave the text as written.
            return match.group(0)
        return match.group(1) + ''.join(f'[{index}]' for index in indices)

    # Braced subscripts. The content is limited to words, whitespace,
    # basic arithmetic and commas, so bounds such as _{i=1} (which belong
    # to summation/product notation) are never touched here.
    result = re.sub(r'(\w+)_\{([\w\s+\-*/,]+)\}', index_each, text)

    # Unbraced subscript: one letter or a digit run, at the end of a word.
    result = re.sub(r'\b(\w+?)_(\d+|[^\W\d_])\b', r'\1[\2]', result)

    return result
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def transform_superscripts(text: str) -> str:
    """
    Transform superscript notation into power notation.

    Transformations:
    - x^2       → x**2
    - x^n       → x**n
    - x^{n}     → x**n
    - x^{n+1}   → x**(n+1)
    - x[i]^2    → x[i]**2
    - (a+b)^2   → (a+b)**2

    The caret is only rewritten when it directly follows a word
    character, ``]`` or ``)``. Mixed forms such as ``x^2_i`` are not
    reordered here.

    Args:
        text: Text that may contain superscript notation

    Returns:
        Text with the recognized superscripts rewritten as ``**``

    Per D-11 from 02-CONTEXT.md.
    """
    # The base may end in "]" or ")" so that indexed names (as produced
    # by the subscript pass) and parenthesised expressions are covered.
    after_base = r'(?<=[\w\]\)])'

    def braced(match):
        exponent = match.group(1).strip()
        if not exponent:
            return match.group(0)
        if re.fullmatch(r'\w+', exponent):
            return '**' + exponent
        # Compound exponents need parentheses to keep their grouping.
        return f'**({exponent})'

    # Braced superscript x^{...}
    result = re.sub(after_base + r'\^\{([^{}]+)\}', braced, text)

    # Simple superscript x^2 / x^n
    result = re.sub(after_base + r'\^(?=[A-Za-z0-9])', '**', result)

    return result
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def transform_operators(text: str) -> str:
    """
    Transform mathematical operators into Python equivalents.

    Transformations:
    - × → *
    - ÷ → /
    - − (U+2212 minus sign) → -
    - ± → +/-
    - ≤ → <=
    - ≥ → >=
    - ≠ → !=
    - ≈ → ~=
    - √x, √x[i] → sqrt(x), sqrt(x[i])
    - √{a+b}, √(a+b) → sqrt(a+b)

    Args:
        text: Text that may contain mathematical operator symbols

    Returns:
        Text with the symbols replaced

    Per D-10 from 02-CONTEXT.md.
    """
    # Every source symbol is a single character and no replacement
    # contains another source symbol, so one translate() pass suffices.
    symbol_table = str.maketrans({
        # Comparison operators
        '≤': '<=',
        '≥': '>=',
        '≠': '!=',
        '≈': '~=',
        # Arithmetic operators
        '×': '*',
        '÷': '/',
        '±': '+/-',
        '\u2212': '-',
    })
    result = text.translate(symbol_table)

    # Square root. A parenthesised radicand already carries the
    # parentheses sqrt() needs, so only the radical sign is replaced.
    result = re.sub(r'√\(', 'sqrt(', result)
    result = re.sub(r'√([\w\[\]]+)', r'sqrt(\1)', result)
    result = re.sub(r'√\{([^}]+)\}', r'sqrt(\1)', result)

    return result
|
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
"""Rule-based parser for algorithm extraction.
|
|
2
|
+
|
|
3
|
+
Uses pattern matching and heuristics to extract structured
|
|
4
|
+
algorithm steps from mathematical text descriptions.
|
|
5
|
+
|
|
6
|
+
Per D-02 from 02-CONTEXT.md.
|
|
7
|
+
"""
|
|
8
|
+
import re
|
|
9
|
+
from typing import List, Optional, Dict, Any, Tuple
|
|
10
|
+
|
|
11
|
+
from .schema import Algorithm, Step, StepType
|
|
12
|
+
from .notation import normalize_notation
|
|
13
|
+
from .boundaries import (
|
|
14
|
+
find_algorithm_name,
|
|
15
|
+
extract_input_section,
|
|
16
|
+
extract_output_section,
|
|
17
|
+
AlgorithmBoundaries,
|
|
18
|
+
detect_algorithm_boundaries
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RuleBasedParser:
    """
    Parser using regex rules to extract algorithm steps.

    Integrates with notation normalization and boundary detection
    to produce structured algorithm representations.

    Per D-02, D-04 from 02-CONTEXT.md.
    """

    # An assignment operator: "←", ":=", or a lone "=" that is not part of
    # a comparison (==, <=, >=, !=, ~=). Notation normalization turns ≤/≥/≠
    # into those comparisons, so a bare "=" test would misfire on them.
    _ASSIGN_OP = r'(?:←|:=|(?<![<>!=~:])=(?!=))'

    # Words that are never reported as variables (compared lower-cased).
    _NON_VARIABLES = frozenset({
        'for', 'while', 'if', 'else', 'return', 'output',
        'end', 'then', 'do', 'in', 'to', 'from', 'and', 'or',
        'is', 'set', 'gets', 'initialize', 'each', 'not',
    })

    # "<name>[<index>] <separator> <description>", where the separator is
    # a hyphen, en/em dash, colon, comma or plain whitespace.
    _IO_PATTERN = r'(\w+(?:\[[^\]]*\])?)\s*(?:[-\u2013\u2014:,]|\s)\s*(.+)'

    def __init__(self):
        # Step detection patterns, tried in order; the first match wins.
        self.step_patterns = [
            # Numbered steps: "1. Do something" or "1) Do something"
            (r'^(?:\s*)(\d+)[.\)]\s*(.+)$', self._parse_numbered_step),
            # Step keyword: "Step 1: Do something"
            (r'^(?:\s*)[Ss]tep\s*(\d+)[:.\)]\s*(.+)$', self._parse_numbered_step),
            # Bullet points as steps
            (r'^(?:\s*)[-*•]\s*(.+)$', self._parse_bullet_step),
        ]

        # Step type detection patterns, tried in order; the first match wins.
        self.type_patterns = [
            (r'^\s*[Rr]eturn', StepType.RETURN),
            (r'^\s*[Oo]utput', StepType.RETURN),
            (r'^\s*[Ff]or\s+each', StepType.LOOP_FOR),
            (r'^\s*[Ff]or\s+\w+\s+(?:from|in|=)', StepType.LOOP_FOR),
            (r'^\s*[Ff]or\s*\(', StepType.LOOP_FOR),
            (r'^\s*[Rr]epeat', StepType.LOOP_FOR),
            (r'^\s*[Ww]hile', StepType.LOOP_WHILE),
            (r'^\s*[Uu]ntil', StepType.LOOP_WHILE),
            (r'^\s*[Ii]f', StepType.CONDITIONAL),
            (r'^\s*[Ww]hen', StepType.CONDITIONAL),
            (r'^\s*[Cc]all\s+\w+\s*\(', StepType.CALL),
            (r'^\s*[Ii]nvoke', StepType.CALL),
        ]

    def parse(self, text: str, name: Optional[str] = None) -> "Algorithm":
        """
        Parse text into an Algorithm.

        Args:
            text: Raw algorithm text
            name: Optional algorithm name (auto-detected if not provided)

        Returns:
            Algorithm object with extracted steps

        Per D-02, D-04 from 02-CONTEXT.md.
        """
        # Steps are parsed from the normalized text; boundaries and the
        # input/output sections are detected on the raw text.
        normalized = normalize_notation(text)
        boundaries = detect_algorithm_boundaries(text)

        # Explicit name, then detected name, then a placeholder.
        if not name:
            name = boundaries.name
        if not name:
            name = "unnamed"

        _, _, input_descs = extract_input_section(text)
        _, _, output_descs = extract_output_section(text)

        steps = self._parse_steps(normalized, boundaries)

        return Algorithm(
            name=name,
            inputs=self._parse_inputs(input_descs),
            outputs=self._parse_outputs(output_descs),
            steps=steps,
            source_text=text
        )

    def _parse_steps(self, text: str, boundaries: "AlgorithmBoundaries") -> List["Step"]:
        """
        Parse steps from normalized text.

        Args:
            text: Normalized text
            boundaries: Detected boundaries; ``steps_start`` / ``steps_end``
                are 1-based line numbers and fall back to the first / last
                line when unset

        Returns:
            List of Step objects, with ids numbered from 1 in order
        """
        lines = text.split('\n')
        steps = []

        first = max(boundaries.steps_start or 1, 1)
        last = min(boundaries.steps_end or len(lines), len(lines))

        for line_num in range(first, last + 1):
            line = lines[line_num - 1]
            stripped = line.strip()
            if not stripped:
                continue

            step_id = len(steps) + 1

            # Explicit step markers (numbers, "Step n", bullets).
            matched = False
            for pattern, handler in self.step_patterns:
                match = re.match(pattern, line)
                if match:
                    step = handler(match, step_id, line_num)
                    if step:
                        steps.append(step)
                        matched = True
                    break
            if matched:
                continue

            # Unmarked line that is long enough to be a step and is not a
            # section header.
            if len(stripped) > 10 and not self._is_section_header(stripped):
                # Classify it like a marked step so that unnumbered loops,
                # conditionals and returns get their proper type. Lines
                # the classifier cannot place keep the historical default
                # of ASSIGNMENT.
                step_type = self._classify_step_type(stripped)
                if step_type == StepType.COMMENT:
                    step_type = StepType.ASSIGNMENT
                steps.append(self._create_step(step_id, step_type, stripped, line_num))

        return steps

    def _parse_numbered_step(self, match, step_id: int, line_num: int) -> Optional["Step"]:
        """Build a step from a numbered / "Step n" pattern match."""
        # The step text is group 2; fall back to group 1 for single-group patterns.
        group = 2 if len(match.groups()) > 1 else 1
        text = match.group(group).strip()

        step_type = self._classify_step_type(text)
        return self._create_step(step_id, step_type, text, line_num)

    def _parse_bullet_step(self, match, step_id: int, line_num: int) -> Optional["Step"]:
        """Build a step from a bullet-point pattern match."""
        text = match.group(1).strip()
        step_type = self._classify_step_type(text)
        return self._create_step(step_id, step_type, text, line_num)

    def _create_step(self, step_id: int, step_type: "StepType", text: str, line_num: int) -> "Step":
        """Create a step, filling the type-specific fields from its text."""
        inputs, outputs = self._extract_variables(text)

        # Only the fields relevant to the step type are populated.
        condition = None
        expression = None
        iter_var = None
        iter_range = None

        if step_type == StepType.LOOP_FOR:
            iter_var, iter_range = self._extract_for_loop_details(text)
        elif step_type == StepType.LOOP_WHILE:
            condition = self._extract_while_condition(text)
        elif step_type == StepType.CONDITIONAL:
            condition = self._extract_if_condition(text)
        elif step_type == StepType.RETURN:
            expression = self._extract_return_value(text)
        elif step_type == StepType.ASSIGNMENT:
            expression = self._extract_assignment_expression(text)

        return Step(
            id=step_id,
            type=step_type,
            description=text,
            inputs=inputs,
            outputs=outputs,
            line_refs=[line_num],
            condition=condition,
            expression=expression,
            iter_var=iter_var,
            iter_range=iter_range
        )

    def _classify_step_type(self, text: str) -> "StepType":
        """
        Classify step type from text using patterns.

        The keyword patterns in ``self.type_patterns`` are tried first (in
        order), then assignment, then a generic call; anything else is a
        COMMENT.
        """
        text_lower = text.lower().strip()

        for pattern, step_type in self.type_patterns:
            if re.search(pattern, text_lower):
                return step_type

        # Assignment: an assignment operator or an assignment phrase.
        if re.search(self._ASSIGN_OP + r'|\s+is\s+|\s+gets\s+|\s+set\s+to\s+', text_lower):
            return StepType.ASSIGNMENT

        # Function call: name(...)
        if re.search(r'\w+\s*\([^)]*\)', text_lower):
            return StepType.CALL

        return StepType.COMMENT

    def _extract_variables(self, text: str) -> Tuple[List[str], List[str]]:
        """
        Extract input and output variables from step text.

        The output is the target of the first assignment form found (at
        most one). Inputs are all other identifiers in order of first
        appearance, without duplicates and without keywords.

        Args:
            text: Step description

        Returns:
            Tuple of (input_vars, output_vars)
        """
        outputs = []

        # Tried in order. "x is set to ..." must come before the bare
        # "set x" form, otherwise "to" would be captured as the target.
        assign_patterns = [
            r'(\w+)\s+is\s+set\s+to',
            r'\b(?:initialize|set)\s+(\w+)',
            # x = ..., x ← ..., x := ..., also with indexing: A[i] = ...
            r'(\w+)(?:\[[^\]]*\])*\s*' + self._ASSIGN_OP,
            r'(\w+)\s+gets',
        ]
        for pattern in assign_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                outputs.append(match.group(1))
                break

        inputs = []
        for match in re.finditer(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\b', text):
            var = match.group(1)
            # Keywords are compared case-insensitively so that a leading
            # "For" / "If" / "Return" is filtered too.
            if var.lower() in self._NON_VARIABLES:
                continue
            if var in outputs or var in inputs:
                continue
            inputs.append(var)

        return inputs, outputs

    def _extract_for_loop_details(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Extract iteration variable and range from a for loop.

        Handles ``for i from 1 to n``, ``for i in S``, ``for i = 1 to n``
        and ``for each x in S``. The range is the text up to a trailing
        ``do``, the first colon, or the end of the step, so a bounded
        loop yields its full range (e.g. ``"1 to n"``).

        Returns:
            (iter_var, iter_range), or (None, None) if no form matches
        """
        match = re.search(
            r'[Ff]or\s+(\w+)(?:\s+(?:from|in)\s+|\s*=\s*)(.+?)(?:\s+do\b|\s*:|$)',
            text,
        )
        if match:
            return match.group(1), match.group(2)

        # Pattern: for each x in S
        match = re.search(r'[Ff]or\s+each\s+(\w+)\s+in\s+(\w+)', text)
        if match:
            return match.group(1), match.group(2)

        return None, None

    def _extract_while_condition(self, text: str) -> Optional[str]:
        """Extract the condition following ``while`` (up to ``:``, ``do`` or end)."""
        match = re.search(r'[Ww]hile\s+(.+?)(?:\s*:|\s+do|\s*$)', text)
        if match:
            return match.group(1).strip()
        return None

    def _extract_if_condition(self, text: str) -> Optional[str]:
        """Extract the condition following ``if`` (up to ``:``, ``then`` or end)."""
        match = re.search(r'[Ii]f\s+(.+?)(?:\s*:|\s+then|\s*$)', text)
        if match:
            return match.group(1).strip()
        return None

    def _extract_return_value(self, text: str) -> Optional[str]:
        """Extract the value following ``return`` or, failing that, ``output``."""
        for keyword in (r'[Rr]eturn', r'[Oo]utput'):
            match = re.search(keyword + r'\s+(.+)$', text)
            if match:
                return match.group(1).strip()
        return None

    def _extract_assignment_expression(self, text: str) -> Optional[str]:
        """Extract the text to the right of the first assignment operator."""
        match = re.search(self._ASSIGN_OP + r'\s*(.+)$', text)
        if match:
            return match.group(1).strip()
        return None

    def _is_section_header(self, text: str) -> bool:
        """Check if text starts with a section-header word (case-insensitive)."""
        header_patterns = [
            r'^(?:Input|Output|Algorithm|Procedure|Function|Method)',
            r'^(?:Given|Parameters|Returns|Result)',
        ]
        return any(re.match(pattern, text, re.IGNORECASE) for pattern in header_patterns)

    def _parse_io_descriptions(self, descriptions: List[str]) -> List[Dict[str, Any]]:
        """Turn "name - description" strings into name/type/description dicts."""
        parsed = []
        for desc in descriptions:
            # e.g. "A[1..n] - array of integers" or "n: an integer"
            match = re.search(self._IO_PATTERN, desc)
            if match:
                name = match.group(1)
                var_type = self._infer_type(match.group(2))
            else:
                # Nothing but a variable name.
                name = desc.strip()
                var_type = "unknown"

            parsed.append({
                "name": name,
                "type": var_type,
                "description": desc
            })
        return parsed

    def _parse_inputs(self, input_descriptions: List[str]) -> List[Dict[str, Any]]:
        """
        Parse input descriptions into structured format.

        Returns one dict per description with the keys ``name``, ``type``
        and ``description``.

        Per D-15 from 02-CONTEXT.md.
        """
        return self._parse_io_descriptions(input_descriptions)

    def _parse_outputs(self, output_descriptions: List[str]) -> List[Dict[str, Any]]:
        """
        Parse output descriptions into structured format.

        Returns one dict per description with the keys ``name``, ``type``
        and ``description``.

        Per D-16 from 02-CONTEXT.md.
        """
        return self._parse_io_descriptions(output_descriptions)

    def _infer_type(self, description: str) -> str:
        """
        Infer variable type from description.

        Type words are matched as whole words (with an optional plural
        "s"), so e.g. "points" is not mistaken for "int".

        Returns:
            One of "matrix", "array", "int", "float", "bool", "str" or
            "unknown"
        """
        desc_lower = description.lower()

        def mentions(*terms):
            alternatives = '|'.join(re.escape(term) for term in terms)
            return re.search(r'\b(?:' + alternatives + r')s?\b', desc_lower) is not None

        if mentions('array', 'list', 'sequence'):
            if mentions('matrix', 'matrices', '2d', 'two-dimensional'):
                return "matrix"
            return "array"
        if mentions('matrix', 'matrices', 'grid', 'table'):
            return "matrix"
        if mentions('integer', 'int', 'whole number'):
            return "int"
        if mentions('float', 'floating', 'real', 'decimal', 'number'):
            return "float"
        if mentions('boolean', 'bool', 'true', 'false'):
            return "bool"
        if mentions('string', 'text'):
            return "str"

        return "unknown"
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def parse_algorithm(text: str, name: Optional[str] = None) -> Algorithm:
    """
    Parse algorithm text with a fresh RuleBasedParser.

    Shorthand for constructing a RuleBasedParser and calling its
    ``parse`` method.

    Args:
        text: Algorithm description
        name: Optional algorithm name

    Returns:
        Parsed Algorithm object
    """
    return RuleBasedParser().parse(text, name)
|