code2llm 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code2flow/__init__.py +47 -0
- code2flow/__main__.py +6 -0
- code2flow/analysis/__init__.py +23 -0
- code2flow/analysis/call_graph.py +210 -0
- code2flow/analysis/cfg.py +293 -0
- code2flow/analysis/coupling.py +77 -0
- code2flow/analysis/data_analysis.py +249 -0
- code2flow/analysis/dfg.py +224 -0
- code2flow/analysis/pipeline_detector.py +445 -0
- code2flow/analysis/side_effects.py +313 -0
- code2flow/analysis/smells.py +192 -0
- code2flow/analysis/type_inference.py +306 -0
- code2flow/cli.py +493 -0
- code2flow/core/__init__.py +36 -0
- code2flow/core/analyzer.py +765 -0
- code2flow/core/config.py +177 -0
- code2flow/core/models.py +194 -0
- code2flow/core/streaming_analyzer.py +666 -0
- code2flow/exporters/__init__.py +35 -0
- code2flow/exporters/base.py +13 -0
- code2flow/exporters/context_exporter.py +207 -0
- code2flow/exporters/flow_exporter.py +570 -0
- code2flow/exporters/json_exporter.py +17 -0
- code2flow/exporters/llm_exporter.py +12 -0
- code2flow/exporters/map_exporter.py +218 -0
- code2flow/exporters/mermaid_exporter.py +67 -0
- code2flow/exporters/toon.py +982 -0
- code2flow/exporters/yaml_exporter.py +108 -0
- code2flow/llm_flow_generator.py +451 -0
- code2flow/llm_task_generator.py +263 -0
- code2flow/mermaid_generator.py +481 -0
- code2flow/nlp/__init__.py +23 -0
- code2flow/nlp/config.py +174 -0
- code2flow/nlp/entity_resolution.py +326 -0
- code2flow/nlp/intent_matching.py +297 -0
- code2flow/nlp/normalization.py +122 -0
- code2flow/nlp/pipeline.py +388 -0
- code2flow/patterns/__init__.py +0 -0
- code2flow/patterns/detector.py +168 -0
- code2flow/refactor/__init__.py +0 -0
- code2flow/refactor/prompt_engine.py +150 -0
- code2flow/visualizers/__init__.py +0 -0
- code2flow/visualizers/graph.py +196 -0
- code2llm-0.3.7.dist-info/METADATA +604 -0
- code2llm-0.3.7.dist-info/RECORD +49 -0
- code2llm-0.3.7.dist-info/WHEEL +5 -0
- code2llm-0.3.7.dist-info/entry_points.txt +2 -0
- code2llm-0.3.7.dist-info/licenses/LICENSE +201 -0
- code2llm-0.3.7.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,481 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mermaid PNG Generator for code2flow
|
|
3
|
+
Integrates with CLI to auto-generate PNG from Mermaid files.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
import subprocess
|
|
9
|
+
import tempfile
|
|
10
|
+
import json
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List, Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def validate_mermaid_file(mmd_path: Path) -> List[str]:
    """Validate a Mermaid diagram file and return a list of error messages.

    Heuristic checks only (no real Mermaid parser): presence of a
    ``graph``/``flowchart`` declaration, balanced brackets/parentheses
    outside label segments, and plausible node identifiers.

    Args:
        mmd_path: Path to the ``.mmd`` file to check.

    Returns:
        Human-readable error strings; empty list when no issue is found.
    """
    # Single local import (the module header does not import re); previously
    # this was imported twice inside the function body.
    import re

    errors = []

    if not mmd_path.exists():
        return [f"File not found: {mmd_path}"]

    try:
        content = mmd_path.read_text(encoding='utf-8')

        # Basic syntax checks
        lines = content.strip().split('\n')

        # Check for proper graph declaration
        if not lines or not any(line.strip().startswith(('graph', 'flowchart')) for line in lines):
            errors.append("Missing graph declaration (should start with 'graph' or 'flowchart')")

        def strip_label_segments(s: str) -> str:
            """Remove label segments that frequently contain Mermaid syntax chars.

            We ignore bracket/paren balancing inside:
            - edge labels: -->|...|
            - node labels: N1["..."] or N1[/'...'/] etc.
            """
            # Remove edge labels |...|
            s = re.sub(r"\|[^|]*\|", "||", s)
            # Remove common node label forms: ["..."], ("..."), {"..."}
            s = re.sub(r"\[\"[^\"]*\"\]", "[]", s)
            s = re.sub(r"\(\"[^\"]*\"\)", "()", s)
            s = re.sub(r"\{\"[^\"]*\"\}", "{}", s)
            # Remove Mermaid special bracket label variants like [/'...'/]
            s = re.sub(r"\[/[^\]]*?/\]", "[]", s)
            s = re.sub(r"\(/[^)]*?/\)", "()", s)
            return s

        # Check for unmatched brackets/parentheses (outside label segments)
        bracket_stack = []
        paren_stack = []

        for line_num, line in enumerate(lines, 1):
            line = line.strip()
            if not line or line.startswith('%%'):
                continue

            # Lines containing a matching open/close pair look like node
            # definitions (ID[content], ID(content), ID{content}); skip them
            # so label text does not trip the balance check.
            if (('[' in line and ']' in line) or
                    ('(' in line and ')' in line) or
                    ('{' in line and '}' in line)):
                continue

            # Count brackets and parentheses (ignoring those inside label segments)
            check_line = strip_label_segments(line)
            for char in check_line:
                if char == '[':
                    bracket_stack.append((']', line_num))
                elif char == ']':
                    if not bracket_stack or bracket_stack[-1][0] != ']':
                        errors.append(f"Line {line_num}: Unmatched ']'")
                    else:
                        bracket_stack.pop()
                elif char == '(':
                    paren_stack.append((')', line_num))
                elif char == ')':
                    if not paren_stack or paren_stack[-1][0] != ')':
                        errors.append(f"Line {line_num}: Unmatched ')'")
                    else:
                        paren_stack.pop()

        # Report unclosed brackets (only for structural ones, not node content)
        for expected, line_num in bracket_stack:
            errors.append(f"Line {line_num}: Unclosed '[' (missing '{expected}')")
        for expected, line_num in paren_stack:
            errors.append(f"Line {line_num}: Unclosed '(' (missing '{expected}')")

        # Check for invalid node IDs (expected forms: 'N1' or 'F123_name')
        node_pattern = re.compile(r'^\s*([A-Z]\d+|[Ff]\d+_\w+)\s*["\'\[\{]')

        for line_num, line in enumerate(lines, 1):
            line = line.strip()
            if not line or line.startswith('%%'):
                continue

            # Skip subgraph lines for node ID validation
            if line.startswith('subgraph ') or line == 'end':
                continue

            # Complete node definitions were already accepted above.
            if (('[' in line and ']' in line) or
                    ('(' in line and ')' in line) or
                    ('{' in line and '}' in line)):
                continue

            # Check node definitions
            if any(char in line for char in ['[', '(', '{']):
                if not node_pattern.match(line):
                    # Try to extract node ID
                    match = re.match(r'^\s*([A-Za-z0-9_]+)', line)
                    if match:
                        node_id = match.group(1)
                        if not re.match(r'^[A-Z]\d+$|^[Ff]\d+_\w+$', node_id):
                            errors.append(f"Line {line_num}: Invalid node ID '{node_id}' (should be like 'N1' or 'F123_name')")

    except Exception as e:
        errors.append(f"Error reading file: {e}")

    return errors
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def fix_mermaid_file(mmd_path: Path) -> bool:
    """Attempt to fix common Mermaid syntax errors in-place.

    The file is rewritten only when at least one line changed.

    Args:
        mmd_path: Path to the ``.mmd`` file to repair.

    Returns:
        True when the file was modified, False otherwise (including on error).
    """
    try:
        content = mmd_path.read_text(encoding='utf-8')
        lines = content.split('\n')
        fixed_lines = []

        import re

        def sanitize_label_text(txt: str) -> str:
            # Mermaid labels frequently break parsing when they contain
            # Mermaid syntax chars. Replace them with HTML entities (Mermaid
            # renders entities back as the literal characters).
            # BUGFIX: previously each character was replaced with itself
            # (a no-op); '&' must be escaped first so produced entities
            # are not double-escaped.
            return (
                txt.replace('&', '&amp;')
                .replace('"', '&quot;')
                .replace('[', '&#91;')
                .replace(']', '&#93;')
                .replace('(', '&#40;')
                .replace(')', '&#41;')
                .replace('{', '&#123;')
                .replace('}', '&#125;')
                .replace('|', '&#124;')
            )

        def sanitize_node_id(node_id: str) -> str:
            """Make a Mermaid-safe node identifier.

            Mermaid node IDs should avoid characters like '[', ']', '(', ')',
            '{', '}', '"', '|'. For call-graph exports, we only need
            stable-ish identifiers.
            """
            node_id = (node_id or '').strip()
            # Cut off at first clearly dangerous Mermaid syntax char.
            node_id = re.split(r"[\[\]\(\)\{\}\"\|\s]", node_id, maxsplit=1)[0]
            # Replace remaining non-word chars just in case.
            node_id = re.sub(r"[^A-Za-z0-9_]", "_", node_id)
            return node_id or "N"

        for line in lines:
            # 2. Fix edge labels that might have pipe issues
            if '-->' in line and '|' in line:
                # Handle edge labels like: N1 -->|"label"| N2
                if '-->|' in line:
                    parts = line.split('-->|', 1)
                    if len(parts) == 2:
                        label_and_target = parts[1]
                        # Find the closing |
                        if '|' in label_and_target:
                            parts2 = label_and_target.split('|', 1)
                            if len(parts2) == 2:
                                label_content, target = parts2
                                # Clean up the label content - remove extra pipes if any
                                label_content = label_content.strip('|')
                                # Fix incomplete parentheses in edge labels
                                if label_content.endswith('('):
                                    label_content = label_content[:-1]  # Remove trailing parenthesis
                                elif label_content.count('(') > label_content.count(')'):
                                    # Add missing closing parentheses
                                    missing_parens = label_content.count('(') - label_content.count(')')
                                    label_content += ')' * missing_parens
                                line = f"{parts[0]}-->|{label_content}|{target}"

            # 2b. Fix stray trailing '|' after node IDs (common breakage: N123|)
            # Only apply to edge lines to avoid touching other Mermaid constructs.
            if '-->' in line:
                line = re.sub(r"(\b[A-Za-z]\w*)\|\s*$", r"\1", line)

            # 2c. Sanitize edge label content inside |...|
            # Example bad line: N1 -->|"char == '('"| N2
            def _sanitize_edge_label(m: re.Match) -> str:
                inner = m.group(1)
                return f"|{sanitize_label_text(inner)}|"

            if '-->' in line and '|' in line:
                line = re.sub(r"\|([^|]{1,200})\|", _sanitize_edge_label, line)

            # 2d. Sanitize edge endpoints for simple call-graph lines: A --> B
            # This fixes cases where a node ID contains '[' which Mermaid treats as a label start.
            if '-->' in line and '|' not in line:
                m = re.match(r"^(\s*)([^\s-]+)\s*-->\s*([^\s]+)\s*$", line)
                if m:
                    indent, src, dst = m.groups()
                    src_id = sanitize_node_id(src)
                    dst_id = sanitize_node_id(dst)
                    line = f"{indent}{src_id} --> {dst_id}"

            # 3. Fix malformed subgraph IDs
            if line.strip().startswith('subgraph '):
                subgraph_part = line.strip()[9:].split('(', 1)
                if len(subgraph_part) == 2:
                    subgraph_id, rest = subgraph_part
                    # Clean subgraph ID ('.', '-', ':' are not ID-safe)
                    subgraph_id = subgraph_id.replace('.', '_').replace('-', '_').replace(':', '_')
                    line = f"    subgraph {subgraph_id}({rest}"

            # 5. Fix class definitions with too many nodes
            if line.strip().startswith('class ') and ',' in line:
                # Split long class lines.
                # BUGFIX: split the *stripped* text, otherwise an indented
                # 'class' line leaks leading spaces and the word 'class'
                # itself into the node list.
                class_parts = line.strip().split(' ', 1)
                if len(class_parts) == 2:
                    nodes_and_class = class_parts[1]
                    nodes, class_name = nodes_and_class.rsplit(' ', 1)
                    node_list = nodes.split(',')
                    if len(node_list) > 10:  # Split if too many nodes
                        # Create multiple lines
                        for i in range(0, len(node_list), 10):
                            chunk = ','.join(node_list[i:i + 10])
                            fixed_lines.append(f"    class {chunk} {class_name}")
                        continue

            fixed_lines.append(line)

        # Write back if changed
        fixed_content = '\n'.join(fixed_lines)
        if fixed_content != content:
            mmd_path.write_text(fixed_content, encoding='utf-8')
            return True

    except Exception as e:
        print(f"Error fixing {mmd_path}: {e}")

    return False
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def generate_pngs(input_dir: Path, output_dir: Path, timeout: int = 60) -> int:
    """Render every ``.mmd`` file found in *input_dir* into a PNG.

    Each file is validated first; an invalid file gets one auto-repair pass
    and is skipped if errors remain afterwards.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.mmd`` files.
        output_dir: Directory receiving the ``<stem>.png`` outputs.
        timeout: Per-file renderer timeout in seconds.

    Returns:
        Number of PNG files successfully generated.
    """
    sources = list(input_dir.glob('*.mmd'))
    if not sources:
        return 0

    produced = 0

    for source in sources:
        target = output_dir / f"{source.stem}.png"

        # Validate, repair once if needed, then re-validate.
        issues = validate_mermaid_file(source)
        if issues:
            print(f"  Fixing {source.name}: {len(issues)} issues")
            fix_mermaid_file(source)

            issues = validate_mermaid_file(source)
            if issues:
                print(f"  Still has errors: {issues[:3]}")  # Show first 3 errors
                continue

        # Hand off to the renderer chain; count only successes.
        if generate_single_png(source, target, timeout):
            produced += 1

    return produced
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def generate_single_png(mmd_file: Path, output_file: Path, timeout: int = 60) -> bool:
    """Generate PNG from single Mermaid file using available renderers.

    Tries, in order: a locally installed ``mmdc``, ``npx`` with
    ``@mermaid-js/mermaid-cli``, and finally a Puppeteer-based HTML
    screenshot fallback (``generate_with_puppeteer``).

    Args:
        mmd_file: Source ``.mmd`` diagram file.
        output_file: Destination PNG path (parent directories are created).
        timeout: Seconds allowed for each renderer attempt.

    Returns:
        True as soon as one renderer succeeds, False when all fail.
    """

    # Create output directory
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Mermaid's default maxTextSize is often too low for large projects,
    # resulting in placeholder PNGs that say "Maximum text size in diagram exceeded".
    # Provide a temporary config with a higher limit.
    try:
        max_text_size = int(os.getenv('CODE2FLOW_MERMAID_MAX_TEXT_SIZE', '2000000'))
    except Exception:
        max_text_size = 2000000

    try:
        max_edges = int(os.getenv('CODE2FLOW_MERMAID_MAX_EDGES', '20000'))
    except Exception:
        max_edges = 20000

    cfg_path: Optional[str] = None
    try:
        cfg = {
            "maxTextSize": max_text_size,
            "maxEdges": max_edges,
            "theme": "default",
        }
        # delete=False: the file must outlive this context manager so the
        # renderer subprocess can read it; removed in the finally block below.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp_cfg:
            tmp_cfg.write(json.dumps(cfg))
            cfg_path = tmp_cfg.name

        # Try different renderers in order of preference
        renderers = [
            (
                'mmdc',
                [
                    'mmdc',
                    '-i',
                    str(mmd_file),
                    '-o',
                    str(output_file),
                    '-t',
                    'default',
                    '-b',
                    'white',
                    '-c',
                    cfg_path,
                ],
            ),
            (
                'npx',
                [
                    'npx',
                    '-y',
                    '@mermaid-js/mermaid-cli',
                    '-i',
                    str(mmd_file),
                    '-o',
                    str(output_file),
                    '-t',
                    'default',
                    '-b',
                    'white',
                    '-c',
                    cfg_path,
                ],
            ),
            ('puppeteer', None),  # Special handling
        ]
    except Exception:
        # If creating config fails for any reason, fall back to renderer defaults.
        renderers = [
            ('mmdc', ['mmdc', '-i', str(mmd_file), '-o', str(output_file), '-t', 'default', '-b', 'white']),
            ('npx', ['npx', '-y', '@mermaid-js/mermaid-cli', '-i', str(mmd_file), '-o', str(output_file)]),
            ('puppeteer', None),
        ]

    try:
        for renderer_name, cmd in renderers:
            try:
                if renderer_name == 'puppeteer':
                    # Special puppeteer handling
                    if generate_with_puppeteer(
                        mmd_file,
                        output_file,
                        timeout,
                        max_text_size=max_text_size,
                        max_edges=max_edges,
                    ):
                        return True
                    continue

                # Run command (argv list, no shell; output captured for the
                # failure message below)
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

                if result.returncode == 0:
                    return True
                else:
                    print(f"  {renderer_name} failed: {result.stderr.strip()}")

            # Each renderer failure falls through to the next candidate.
            except subprocess.TimeoutExpired:
                print(f"  {renderer_name} timed out")
            except FileNotFoundError:
                print(f"  {renderer_name} not available")
            except Exception as e:
                print(f"  {renderer_name} error: {e}")

        return False
    finally:
        # Best-effort removal of the temporary config file.
        if cfg_path:
            try:
                os.unlink(cfg_path)
            except Exception:
                pass
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def generate_with_puppeteer(
    mmd_file: Path,
    output_file: Path,
    timeout: int = 60,
    max_text_size: int = 2000000,
    max_edges: int = 20000,
) -> bool:
    """Generate PNG using Puppeteer with HTML template.

    Inlines the raw Mermaid source into a temporary HTML page that loads
    mermaid from a CDN, then screenshots the rendered page.

    NOTE(review): this shells out to ``npx puppeteer screenshot`` — verify
    that this subcommand exists for the installed puppeteer version; it may
    not be available in all releases. Also requires network access for the
    CDN script.

    Args:
        mmd_file: Source ``.mmd`` file whose text is embedded verbatim.
        output_file: Destination PNG path.
        timeout: Seconds allowed for the screenshot subprocess.
        max_text_size: Forwarded to ``mermaid.initialize`` to raise the limit.
        max_edges: Forwarded to ``mermaid.initialize`` to raise the limit.

    Returns:
        True when the subprocess exits with code 0, else False.
    """
    try:
        mmd_content = mmd_file.read_text(encoding='utf-8')

        # Doubled braces ({{ }}) are literal braces inside this f-string.
        html_template = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<script src="https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"></script>
<style>
body {{ margin: 0; padding: 20px; background: white; font-family: Arial, sans-serif; }}
.mermaid {{ max-width: none; }}
</style>
</head>
<body>
<div class="mermaid">
{mmd_content}
</div>
<script>
mermaid.initialize({{ startOnLoad: true, theme: 'default', maxTextSize: {max_text_size}, maxEdges: {max_edges} }});
</script>
</body>
</html>
"""

        # Create temporary HTML (delete=False so the subprocess can read it;
        # removed in the finally block)
        with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as tmp_html:
            tmp_html.write(html_template)
            tmp_html_path = tmp_html.name

        try:
            # Use puppeteer screenshot
            cmd = [
                'npx', '-y', 'puppeteer',
                'screenshot',
                '--url', f'file://{tmp_html_path}',
                '--output', str(output_file),
                '--wait-for', '.mermaid',
                '--full-page'
            ]

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

            return result.returncode == 0

        finally:
            os.unlink(tmp_html_path)

    except Exception as e:
        print(f"  Puppeteer error: {e}")
        return False
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
if __name__ == '__main__':
    # Ad-hoc CLI wrapper for testing the PNG generation pipeline directly.
    import argparse

    cli = argparse.ArgumentParser(description='Generate PNG from Mermaid files')
    cli.add_argument('input_dir', help='Directory with .mmd files')
    cli.add_argument('output_dir', help='Output directory for PNG files')
    opts = cli.parse_args()

    count = generate_pngs(Path(opts.input_dir), Path(opts.output_dir))
    print(f"Generated {count} PNG files")
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""NLP Processing Pipeline for code2flow.
|
|
2
|
+
|
|
3
|
+
Provides query normalization, intent matching, and entity resolution
|
|
4
|
+
with multilingual support and fuzzy matching.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
__version__ = "0.2.5"
|
|
8
|
+
|
|
9
|
+
from .pipeline import NLPPipeline
|
|
10
|
+
from .normalization import QueryNormalizer
|
|
11
|
+
from .intent_matching import IntentMatcher
|
|
12
|
+
from .entity_resolution import EntityResolver
|
|
13
|
+
from .config import NLPConfig, FAST_NLP_CONFIG, PRECISE_NLP_CONFIG
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"NLPPipeline",
|
|
17
|
+
"QueryNormalizer",
|
|
18
|
+
"IntentMatcher",
|
|
19
|
+
"EntityResolver",
|
|
20
|
+
"NLPConfig",
|
|
21
|
+
"FAST_NLP_CONFIG",
|
|
22
|
+
"PRECISE_NLP_CONFIG",
|
|
23
|
+
]
|
code2flow/nlp/config.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""NLP Configuration - YAML-driven settings for NLP pipeline."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import List, Dict, Optional
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class NormalizationConfig:
    """Configuration for query normalization (pipeline stage 1)."""
    # 1a: Lowercase conversion
    lowercase: bool = True
    # 1b: Punctuation removal
    remove_punctuation: bool = True
    # 1c: Whitespace normalization
    normalize_whitespace: bool = True
    # 1d: Unicode normalization (NFKC)
    unicode_normalize: bool = True
    # 1e: Stopword removal (off by default; uses the lists below when on)
    remove_stopwords: bool = False
    # Language-specific stopword lists, keyed by ISO 639-1 language code.
    stopwords: Dict[str, List[str]] = field(default_factory=lambda: {
        "en": ["the", "a", "an", "is", "are", "was", "were"],
        "pl": ["w", "z", "do", "na", "jest", "są"],
    })
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class IntentMatchingConfig:
    """Configuration for intent matching (pipeline stage 2)."""
    # 2a: Fuzzy matching threshold (0.0-1.0); candidates below it are rejected
    fuzzy_threshold: float = 0.8
    # 2b: Semantic similarity threshold (0.0-1.0)
    semantic_threshold: float = 0.85
    # 2c: Keyword matching weight in the combined score
    keyword_weight: float = 0.6
    # 2d: Context window size
    context_window: int = 3
    # 2e: Multi-intent resolution strategy
    multi_intent_strategy: str = "best_match"  # best_match, combine, sequential
    # Fuzzy matching algorithm
    fuzzy_algorithm: str = "token_sort_ratio"  # ratio, partial_ratio, token_sort_ratio
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class EntityResolutionConfig:
    """Configuration for entity resolution (pipeline stage 3)."""
    # 3a: Entity types to extract from queries
    entity_types: List[str] = field(default_factory=lambda: [
        "function", "class", "module", "variable", "file"
    ])
    # 3b: Name matching threshold (0.0-1.0)
    name_match_threshold: float = 0.9
    # 3c: Context-aware disambiguation
    context_disambiguation: bool = True
    # 3d: Hierarchical resolution (class.method -> method)
    hierarchical_resolution: bool = True
    # 3e: Alias resolution (short names -> qualified names)
    alias_resolution: bool = True
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class MultilingualConfig:
    """Configuration for multilingual processing."""
    # Supported languages (ISO 639-1 codes)
    languages: List[str] = field(default_factory=lambda: ["en", "pl"])
    # Default language used when detection is inconclusive
    default_language: str = "en"
    # Language detection confidence threshold (0.0-1.0)
    lang_detect_threshold: float = 0.7
    # Cross-language matching (match queries against entities in any language)
    cross_language_matching: bool = True
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class NLPConfig:
    """Main NLP pipeline configuration.

    Aggregates the per-stage sub-configurations plus on/off switches for
    each pipeline stage. Serializable to/from YAML.
    """
    # Sub-configurations
    normalization: NormalizationConfig = field(default_factory=NormalizationConfig)
    intent_matching: IntentMatchingConfig = field(default_factory=IntentMatchingConfig)
    entity_resolution: EntityResolutionConfig = field(default_factory=EntityResolutionConfig)
    multilingual: MultilingualConfig = field(default_factory=MultilingualConfig)

    # Pipeline stages
    enable_normalization: bool = True
    enable_intent_matching: bool = True
    enable_entity_resolution: bool = True

    # Logging
    verbose: bool = False

    @classmethod
    def from_yaml(cls, path: str) -> "NLPConfig":
        """Load configuration from a YAML file.

        Missing sections fall back to their dataclass defaults.

        Args:
            path: Path to the YAML file.

        Returns:
            A populated NLPConfig instance.
        """
        with open(path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; treat that as an
            # empty mapping so an empty file yields a default config instead
            # of raising AttributeError on data.get().
            data = yaml.safe_load(f) or {}

        return cls(
            normalization=NormalizationConfig(**data.get('normalization', {})),
            intent_matching=IntentMatchingConfig(**data.get('intent_matching', {})),
            entity_resolution=EntityResolutionConfig(**data.get('entity_resolution', {})),
            multilingual=MultilingualConfig(**data.get('multilingual', {})),
            enable_normalization=data.get('enable_normalization', True),
            enable_intent_matching=data.get('enable_intent_matching', True),
            enable_entity_resolution=data.get('enable_entity_resolution', True),
            verbose=data.get('verbose', False),
        )

    def to_yaml(self, path: str) -> None:
        """Save configuration to a YAML file, creating parent directories.

        Args:
            path: Destination file path.
        """
        data = {
            'normalization': self.normalization.__dict__,
            'intent_matching': self.intent_matching.__dict__,
            'entity_resolution': self.entity_resolution.__dict__,
            'multilingual': self.multilingual.__dict__,
            'enable_normalization': self.enable_normalization,
            'enable_intent_matching': self.enable_intent_matching,
            'enable_entity_resolution': self.enable_entity_resolution,
            'verbose': self.verbose,
        }

        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# Predefined configurations

# Speed-oriented preset: looser thresholds, keyword-heavy scoring, fewer
# entity types, and no context disambiguation.
FAST_NLP_CONFIG = NLPConfig(
    normalization=NormalizationConfig(
        lowercase=True,
        remove_punctuation=True,
        normalize_whitespace=True,
        unicode_normalize=True,
        remove_stopwords=False,
    ),
    intent_matching=IntentMatchingConfig(
        fuzzy_threshold=0.7,  # Lower threshold for speed
        semantic_threshold=0.8,
        keyword_weight=0.8,  # Higher weight on keywords
        fuzzy_algorithm="ratio",  # Faster algorithm
    ),
    entity_resolution=EntityResolutionConfig(
        entity_types=["function", "class"],
        name_match_threshold=0.85,
        context_disambiguation=False,  # Skip for speed
    ),
    verbose=False,
)

# Precision-oriented preset: strict thresholds, stopword removal, all entity
# types, full disambiguation/alias resolution, and verbose logging.
PRECISE_NLP_CONFIG = NLPConfig(
    normalization=NormalizationConfig(
        lowercase=True,
        remove_punctuation=True,
        normalize_whitespace=True,
        unicode_normalize=True,
        remove_stopwords=True,
    ),
    intent_matching=IntentMatchingConfig(
        fuzzy_threshold=0.9,
        semantic_threshold=0.95,
        keyword_weight=0.4,
        context_window=5,
        fuzzy_algorithm="token_sort_ratio",
    ),
    entity_resolution=EntityResolutionConfig(
        entity_types=["function", "class", "module", "variable", "file"],
        name_match_threshold=0.95,
        context_disambiguation=True,
        hierarchical_resolution=True,
        alias_resolution=True,
    ),
    verbose=True,
)
|